In [2]:
import numpy as np 
import pandas as pd 
import scipy

# from subprocess import check_output
# print(check_output(["ls", "../input"]).decode("utf8"))
In [6]:
import warnings
warnings.filterwarnings('ignore')

from IPython.core.display import HTML
from matplotlib import rcParams

import seaborn as sns
import matplotlib.pylab as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
In [3]:
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.model_selection import KFold, ParameterGrid, cross_val_score, GridSearchCV

from sklearn.metrics import mean_squared_error, median_absolute_error, mean_absolute_error
from sklearn.metrics import r2_score, explained_variance_score

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, ExtraTreesRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.linear_model import Ridge, RidgeCV, BayesianRidge
from sklearn.linear_model import HuberRegressor, TheilSenRegressor, RANSACRegressor

from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
In [56]:
import keras as ks
from keras.models import Sequential, load_model, Model

from keras.optimizers import SGD, RMSprop

from keras.layers import Dense, Dropout, LSTM
from keras.layers import Activation, Flatten, Input, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, Conv2D, MaxPooling2D
from keras.layers.embeddings import Embedding

from keras.wrappers.scikit_learn import KerasRegressor

from keras.callbacks import ModelCheckpoint, EarlyStopping
In [57]:
def regression(regressor, x_train, x_test, y_train):
    reg = regressor
    reg.fit(x_train, y_train)
    
    y_train_reg = reg.predict(x_train)
    y_test_reg = reg.predict(x_test)
    
    return y_train_reg, y_test_reg

def loss_plot(fit_history):
    plt.figure(figsize=(18, 6))

    plt.plot(fit_history.history['loss'], color='#348ABD', label = 'train')
    plt.plot(fit_history.history['val_loss'], color='#228B22', label = 'test')

    plt.legend()
    plt.title('Loss Function');  
    
def mae_plot(fit_history):
    plt.figure(figsize=(18, 6))

    plt.plot(fit_history.history['mean_absolute_error'], color='#348ABD', label = 'train')
    plt.plot(fit_history.history['val_mean_absolute_error'], color='#228B22', label = 'test')

    plt.legend()
    plt.title('Mean Absolute Error');   

def scores(regressor, y_train, y_test, y_train_reg, y_test_reg):
    print("_______________________________________")
    print(regressor)
    print("_______________________________________")
    print("EV score. Train: ", explained_variance_score(y_train, y_train_reg))
    print("EV score. Test: ", explained_variance_score(y_test, y_test_reg))
    print("---------")
    print("R2 score. Train: ", r2_score(y_train, y_train_reg))
    print("R2 score. Test: ", r2_score(y_test, y_test_reg))
    print("---------")
    print("MSE score. Train: ", mean_squared_error(y_train, y_train_reg))
    print("MSE score. Test: ", mean_squared_error(y_test, y_test_reg))
    print("---------")
    print("MAE score. Train: ", mean_absolute_error(y_train, y_train_reg))
    print("MAE score. Test: ", mean_absolute_error(y_test, y_test_reg))
    print("---------")
    print("MdAE score. Train: ", median_absolute_error(y_train, y_train_reg))
    print("MdAE score. Test: ", median_absolute_error(y_test, y_test_reg))
    
def scores2(regressor, target, target_predict):
    print("_______________________________________")
    print(regressor)
    print("_______________________________________")
    print("EV score:", explained_variance_score(target, target_predict))
    print("---------")
    print("R2 score:", r2_score(target, target_predict))
    print("---------")
    print("MSE score:", mean_squared_error(target, target_predict))
    print("---------")
    print("MAE score:", mean_absolute_error(target, target_predict))
    print("---------")
    print("MdAE score:", median_absolute_error(target, target_predict))

1. Problem Statement

Sberbank is challenging programmers to develop algorithms which use a broad spectrum of features to predict real prices. Competitors will rely on a rich dataset that includes housing data and macroeconomic patterns. An accurate forecasting model will allow Sberbank to provide more certainty to their customers in an uncertain economy.


2. Datasets and Inputs

2.1 Description by Files

In [8]:
HTML('''<div id="data">
<p><iframe src="data_dictionary.txt" frameborder="0" height="300"width="97%"></iframe></p>
</div>''')
Out[8]:

2.2 Load and Display the Data

In [16]:
macro = pd.read_csv('macro.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [18]:
macro[100:110].T[1:10]
Out[18]:
100 101 102 103 104 105 106 107 108 109
oil_urals 82.87 82.87 82.87 82.87 82.87 82.87 82.87 82.87 82.87 82.87
gdp_quart 9995.8 9995.8 9995.8 9995.8 9995.8 9995.8 9995.8 9995.8 9995.8 9995.8
gdp_quart_growth 4.1 4.1 4.1 4.1 4.1 4.1 4.1 4.1 4.1 4.1
cpi 319.8 319.8 319.8 319.8 319.8 319.8 319.8 319.8 319.8 319.8
ppi 350.2 350.2 350.2 350.2 350.2 350.2 350.2 350.2 350.2 350.2
gdp_deflator NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
balance_trade 16.604 16.604 16.604 16.604 16.604 16.604 16.604 16.604 16.604 16.604
balance_trade_growth 14.1 14.1 14.1 14.1 14.1 14.1 14.1 14.1 14.1 14.1
usdrub 29.1525 29.0261 29.1 28.9194 29.0239 29.092 29.092 29.092 29.1835 29.1398
In [19]:
train[200:210].T[1:10]
Out[19]:
200 201 202 203 204 205 206 207 208 209
timestamp 2011-10-25 2011-10-25 2011-10-25 2011-10-25 2011-10-26 2011-10-26 2011-10-26 2011-10-26 2011-10-26 2011-10-26
full_sq 38 33 30 76 44 35 72 32 84 45
life_sq 19 14 18 51 29 21 45 18 43 26
floor 15 8 3 2 8 5 10 6 21 5
max_floor NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
material NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
build_year NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
num_room NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
kitch_sq NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

3. Solution Statement

3.1 Feature Selection

In [29]:
X_list_num = ['timestamp',
              'full_sq', 'num_room', 'area_m', 
              'kremlin_km', 'big_road2_km', 'big_road1_km',
              'workplaces_km',
              'stadium_km', 'swim_pool_km', 'fitness_km', 
              'detention_facility_km', 'cemetery_km',
              'radiation_km', 'oil_chemistry_km',
              'theater_km', 'exhibition_km', 'museum_km', 
              'park_km', 'public_healthcare_km',  
              'metro_min_walk','metro_km_avto', 
              'bus_terminal_avto_km', 'public_transport_station_min_walk',
              'railroad_station_walk_min', 'railroad_station_avto_km',
              'kindergarten_km', 'school_km', 'preschool_km',
              'university_km', 'additional_education_km',
              'shopping_centers_km', 'big_market_km',
              'ekder_all', 'work_all', 'young_all']

X_list_cat = ['sub_area', 'ID_metro', 
              'office_raion', 'sport_objects_raion',
              'raion_popul', 'healthcare_centers_raion',
              'school_education_centers_raion', 
              'preschool_education_centers_raion']

target_train = train['price_doc']
In [30]:
plt.style.use('seaborn-whitegrid')
f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(18, 6))

sns.distplot(target_train, bins=200, color='#228B22', ax=ax1)
ax1.set_xlabel("Prices")

sns.distplot(np.log(target_train), bins=200, color='#228B22', ax=ax2)
ax2.set_xlabel("Logarithm of the variable 'Prices'")

plt.suptitle('Sberbank Russian Housing Data');
In [24]:
print ("Sberbank Russian Housing Dataset Statistics: \n")
print ("Number of houses = ", len(target_train))
print ("Number of features = ", len(list(features_train.keys())))
print ("Minimum house price = ", np.min(target_train))
print ("Maximum house price = ", np.max(target_train))
print ("Mean house price = ", "%.2f" % np.mean(target_train))
print ("Median house price = ", "%.2f" % np.median(target_train))
print ("Standard deviation of house prices =", "%.2f" % np.std(target_train))
Sberbank Russian Housing Dataset Statistics: 

Number of houses =  30471
Number of features =  48
Minimum house price =  100000
Maximum house price =  111111112
Mean house price =  7123035.28
Median house price =  6274411.00
Standard deviation of house prices = 4780032.89

3.2 Fill in Missing Values

In [25]:
features_train.isnull().sum()
Out[25]:
full_sq                                 0
num_room                             9572
floor                                 167
area_m                                  0
timestamp                               0
preschool_education_centers_raion       0
school_education_centers_raion          0
children_preschool                      0
children_school                         0
shopping_centers_raion                  0
healthcare_centers_raion                0
office_raion                            0
sport_objects_raion                     0
public_transport_station_min_walk       0
railroad_station_walk_min              25
railroad_station_avto_km                0
bus_terminal_avto_km                    0
cafe_count_500                          0
kremlin_km                              0
workplaces_km                           0
ID_metro                                0
metro_km_avto                           0
metro_min_walk                         25
public_healthcare_km                    0
shopping_centers_km                     0
big_market_km                           0
fitness_km                              0
swim_pool_km                            0
stadium_km                              0
park_km                                 0
kindergarten_km                         0
school_km                               0
preschool_km                            0
university_km                           0
additional_education_km                 0
theater_km                              0
exhibition_km                           0
museum_km                               0
big_road1_km                            0
big_road2_km                            0
detention_facility_km                   0
cemetery_km                             0
oil_chemistry_km                        0
radiation_km                            0
raion_popul                             0
work_all                                0
young_all                               0
ekder_all                               0
dtype: int64
In [26]:
features_test.isnull().sum()
Out[26]:
full_sq                               0
num_room                              0
floor                                 0
area_m                                0
timestamp                             0
preschool_education_centers_raion     0
school_education_centers_raion        0
children_preschool                    0
children_school                       0
shopping_centers_raion                0
healthcare_centers_raion              0
office_raion                          0
sport_objects_raion                   0
public_transport_station_min_walk     0
railroad_station_walk_min            34
railroad_station_avto_km              0
bus_terminal_avto_km                  0
cafe_count_500                        0
kremlin_km                            0
workplaces_km                         0
ID_metro                              0
metro_km_avto                         0
metro_min_walk                       34
public_healthcare_km                  0
shopping_centers_km                   0
big_market_km                         0
fitness_km                            0
swim_pool_km                          0
stadium_km                            0
park_km                               0
kindergarten_km                       0
school_km                             0
preschool_km                          0
university_km                         0
additional_education_km               0
theater_km                            0
exhibition_km                         0
museum_km                             0
big_road1_km                          0
big_road2_km                          0
detention_facility_km                 0
cemetery_km                           0
oil_chemistry_km                      0
radiation_km                          0
raion_popul                           0
work_all                              0
young_all                             0
ekder_all                             0
dtype: int64
In [31]:
df_train = pd.DataFrame(train, columns=X_list_num)
df_train_cat = pd.DataFrame(train, columns=X_list_num+X_list_cat)

df_test = pd.DataFrame(test, columns=X_list_num)
df_test_cat = pd.DataFrame(test, columns=X_list_num+X_list_cat)

df_train['prices'] = target_train
df_train_cat['prices'] = target_train

df_train = df_train.dropna(subset=['num_room'])
df_train_cat = df_train_cat.dropna(subset=['num_room'])

df_train['metro_min_walk'] = \
df_train['metro_min_walk'].interpolate(method='linear')
df_train_cat['metro_min_walk'] = \
df_train_cat['metro_min_walk'].interpolate(method='linear')

df_train['railroad_station_walk_min'] = \
df_train['railroad_station_walk_min'].interpolate(method='linear')
df_train_cat['railroad_station_walk_min'] = \
df_train_cat['railroad_station_walk_min'].interpolate(method='linear')

df_test['metro_min_walk'] = \
df_test['metro_min_walk'].interpolate(method='linear')
df_test_cat['metro_min_walk'] = \
df_test_cat['metro_min_walk'].interpolate(method='linear')

df_test['railroad_station_walk_min'] = \
df_test['railroad_station_walk_min'].interpolate(method='linear')
df_test_cat['railroad_station_walk_min'] = \
df_test_cat['railroad_station_walk_min'].interpolate(method='linear')

len(df_train)
Out[31]:
20899

3.3 Categorical and Macro Features

In [32]:
# Add the Macro Feature
usdrub_pairs = dict(zip(list(macro['timestamp']), list(macro['usdrub'])))
# salary_pairs = dict(zip(list(macro['timestamp']), list(macro['salary'])))

df_train['timestamp'].replace(usdrub_pairs,inplace=True)
df_train_cat['timestamp'].replace(usdrub_pairs,inplace=True)

df_test['timestamp'].replace(usdrub_pairs,inplace=True)
df_test_cat['timestamp'].replace(usdrub_pairs,inplace=True)

df_train.rename(columns={'timestamp' : 'usdrub'}, inplace=True)
df_train_cat.rename(columns={'timestamp' : 'usdrub'}, inplace=True)

df_test.rename(columns={'timestamp' : 'usdrub'}, inplace=True)
df_test_cat.rename(columns={'timestamp' : 'usdrub'}, inplace=True)
In [33]:
# Preprocess Categorical Features
for df in [df_train_cat, df_test_cat]:
    print ("____________________________________________")
    
    print('sub area')
    print('Number of categories:', len(set(df['sub_area'])))
    print(set(df['sub_area']))

    print('\nID metro')
    print('Number of categories:', len(set(df['ID_metro'])))
    print(set(df['ID_metro']))

    print('\noffice raion')
    print('Number of categories:', len(set(df['office_raion'])))
    print(set(df['office_raion']))

    print('\nsport objects raion')
    print('Number of categories:', len(set(df['sport_objects_raion'])))
    print(set(df_train_cat['sport_objects_raion']))

    print('\nraion popul')
    print('Number of categories:', len(set(df['raion_popul'])))
    print(set(df['raion_popul']))

    print('\nhealthcare centers raion')
    print('Number of categories:', len(set(df_train_cat['healthcare_centers_raion'])))
    print(set(df['healthcare_centers_raion']))

    print('\nschool education centers raion')
    print('Number of categories:', len(set(df['school_education_centers_raion'])))
    print(set(df['school_education_centers_raion']))

    print('\npreschool education centers raion')
    print('Number of categories:', len(set(df['preschool_education_centers_raion'])))
    print(set(df['preschool_education_centers_raion']))
____________________________________________
sub area
Number of categories: 146
{'Beskudnikovskoe', 'Danilovskoe', 'Nizhegorodskoe', 'Nagatinskij Zaton', 'Poselenie Shhapovskoe', "Altuf'evskoe", 'Krjukovo', 'Cheremushki', "Chertanovo Central'noe", 'Babushkinskoe', "Mar'ino", 'Otradnoe', 'Nagatino-Sadovniki', 'Poselenie Shherbinka', 'Koptevo', 'Zapadnoe Degunino', 'Poselenie Rogovskoe', 'Sokol', 'Donskoe', 'Ramenki', 'Poselenie Pervomajskoe', 'Novo-Peredelkino', 'Poselenie Rjazanovskoe', 'Ivanovskoe', "Zamoskvorech'e", "Moskvorech'e-Saburovo", 'Sviblovo', 'Zjablikovo', 'Rjazanskij', 'Poselenie Novofedorovskoe', 'Krylatskoe', 'Birjulevo Vostochnoe', 'Hovrino', 'Ostankinskoe', 'Molzhaninovskoe', 'Poselenie Voskresenskoe', 'Vnukovo', 'Dmitrovskoe', 'Chertanovo Juzhnoe', 'Perovo', 'Kuncevo', "Kon'kovo", 'Kurkino', 'Bogorodskoe', 'Ljublino', 'Troickij okrug', 'Vojkovskoe', 'Severnoe', 'Bibirevo', 'Marfino', 'Savelki', 'Nagornoe', 'Vostochnoe Degunino', 'Horoshevo-Mnevniki', 'Chertanovo Severnoe', 'Zjuzino', 'Poselenie Moskovskij', 'Izmajlovo', 'Obruchevskoe', 'Solncevo', "Kuz'minki", 'Fili Davydkovo', 'Juzhnoe Tushino', 'Kosino-Uhtomskoe', 'Poselenie Sosenskoe', 'Begovoe', 'Lomonosovskoe', 'Meshhanskoe', 'Presnenskoe', 'Vyhino-Zhulebino', 'Filevskij Park', 'Mozhajskoe', 'Juzhnoe Butovo', 'Poselenie Kievskij', 'Poselenie Desjonovskoe', 'Poselenie Kokoshkino', 'Ochakovo-Matveevskoe', "Sokol'niki", 'Severnoe Izmajlovo', 'Lefortovo', 'Jakimanka', 'Sokolinaja Gora', 'Juzhnoportovoe', 'Staroe Krjukovo', 'Butyrskoe', 'Pechatniki', 'Shhukino', 'Poselenie Krasnopahorskoe', 'Golovinskoe', 'Orehovo-Borisovo Juzhnoe', 'Severnoe Medvedkovo', 'Teplyj Stan', 'Juzhnoe Medvedkovo', 'Poselenie Mosrentgen', 'Poselenie Vnukovskoe', 'Orehovo-Borisovo Severnoe', 'Preobrazhenskoe', 'Ajeroport', 'Kotlovka', 'Pokrovskoe Streshnevo', 'Lianozovo', 'Birjulevo Zapadnoe', 'Prospekt Vernadskogo', 'Poselenie Voronovskoe', 'Nekrasovka', 'Alekseevskoe', 'Poselenie Mihajlovo-Jarcevskoe', 'Hamovniki', 'Metrogorodok', 'Severnoe Tushino', 'Jaroslavskoe', 'Tverskoe', 'Troparevo-Nikulino', 'Poselenie Filimonkovskoe', 'Horoshevskoe', "Mar'ina Roshha", 'Losinoostrovskoe', 'Taganskoe', 'Arbat', "Krasnosel'skoe", 'Matushkino', 'Savelovskoe', 'Caricyno', 'Timirjazevskoe', "Tekstil'shhiki", 'Dorogomilovo', "Gol'janovo", 'Brateevo', 'Poselenie Klenovskoe', 'Severnoe Butovo', 'Rostokino', 'Veshnjaki', 'Basmannoe', 'Mitino', 'Vostochnoe Izmajlovo', 'Novokosino', 'Vostochnoe', 'Strogino', 'Kapotnja', 'Jasenevo', 'Gagarinskoe', 'Novogireevo', 'Akademicheskoe', 'Levoberezhnoe', 'Silino', 'Poselenie Marushkinskoe'}

ID metro
Number of categories: 219
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223}

office raion
Number of categories: 30
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 141, 14, 16, 19, 20, 23, 24, 27, 37, 39, 45, 48, 56, 59, 73, 84, 87, 93}

sport objects raion
Number of categories: 24
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 25, 29}

raion popul
Number of categories: 146
{90114, 116742, 6161, 28179, 76308, 80917, 68630, 53786, 94236, 41504, 71715, 8227, 12327, 83502, 21040, 77878, 139322, 118843, 81980, 57405, 13890, 78418, 178264, 85083, 112221, 101982, 4199, 5740, 61039, 75377, 123000, 37502, 142462, 67710, 2693, 108171, 57995, 47245, 57999, 115352, 153248, 87713, 118945, 21155, 112804, 145576, 78507, 247469, 7341, 130229, 125111, 129207, 38075, 102590, 105663, 96959, 145088, 86206, 8384, 56535, 79576, 85721, 102618, 156377, 85219, 113897, 174831, 132349, 51455, 111874, 111374, 57107, 43795, 78616, 12061, 155427, 55590, 178473, 143661, 73007, 48439, 36154, 21819, 64317, 26943, 103746, 102726, 32071, 101708, 9553, 157010, 17236, 55125, 4949, 27992, 130396, 165727, 94561, 94564, 7538, 89971, 28537, 89467, 76156, 17790, 76670, 2942, 83844, 123280, 166803, 80791, 60315, 175518, 4001, 142243, 64931, 83369, 125354, 102828, 37807, 111023, 155572, 65972, 73148, 31167, 39873, 3521, 72131, 85956, 106445, 7122, 26578, 61396, 219609, 78810, 104410, 91100, 81887, 19940, 100846, 122862, 32241, 104434, 2546, 122873, 76284}

healthcare centers raion
Number of categories: 7
{0, 1, 2, 3, 4, 5, 6}

school education centers raion
Number of categories: 14
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14}

preschool education centers raion
Number of categories: 13
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13}
____________________________________________
sub area
Number of categories: 145
{'Beskudnikovskoe', 'Danilovskoe', 'Nizhegorodskoe', 'Nagatinskij Zaton', 'Poselenie Shhapovskoe', 'Krjukovo', "Altuf'evskoe", 'Cheremushki', 'Babushkinskoe', "Chertanovo Central'noe", "Mar'ino", 'Otradnoe', 'Nagatino-Sadovniki', 'Poselenie Shherbinka', 'Koptevo', 'Zapadnoe Degunino', 'Poselenie Rogovskoe', 'Sokol', 'Donskoe', 'Ramenki', 'Novo-Peredelkino', 'Poselenie Pervomajskoe', 'Poselenie Rjazanovskoe', 'Ivanovskoe', "Zamoskvorech'e", "Moskvorech'e-Saburovo", 'Sviblovo', 'Zjablikovo', 'Rjazanskij', 'Poselenie Novofedorovskoe', 'Krylatskoe', 'Birjulevo Vostochnoe', 'Hovrino', 'Ostankinskoe', 'Molzhaninovskoe', 'Poselenie Voskresenskoe', 'Vnukovo', 'Chertanovo Juzhnoe', 'Dmitrovskoe', 'Perovo', 'Kuncevo', "Kon'kovo", 'Kurkino', 'Ljublino', 'Bogorodskoe', 'Troickij okrug', 'Vojkovskoe', 'Severnoe', 'Bibirevo', 'Marfino', 'Savelki', 'Nagornoe', 'Vostochnoe Degunino', 'Horoshevo-Mnevniki', 'Chertanovo Severnoe', 'Zjuzino', 'Poselenie Moskovskij', 'Izmajlovo', 'Obruchevskoe', 'Solncevo', "Kuz'minki", 'Fili Davydkovo', 'Juzhnoe Tushino', 'Kosino-Uhtomskoe', 'Poselenie Sosenskoe', 'Begovoe', 'Lomonosovskoe', 'Meshhanskoe', 'Presnenskoe', 'Vyhino-Zhulebino', 'Filevskij Park', 'Mozhajskoe', 'Juzhnoe Butovo', 'Poselenie Kievskij', 'Poselenie Desjonovskoe', 'Poselenie Kokoshkino', 'Ochakovo-Matveevskoe', "Sokol'niki", 'Severnoe Izmajlovo', 'Lefortovo', 'Jakimanka', 'Sokolinaja Gora', 'Juzhnoportovoe', 'Staroe Krjukovo', 'Butyrskoe', 'Pechatniki', 'Shhukino', 'Poselenie Krasnopahorskoe', 'Orehovo-Borisovo Juzhnoe', 'Golovinskoe', 'Severnoe Medvedkovo', 'Teplyj Stan', 'Juzhnoe Medvedkovo', 'Poselenie Mosrentgen', 'Poselenie Vnukovskoe', 'Orehovo-Borisovo Severnoe', 'Preobrazhenskoe', 'Ajeroport', 'Kotlovka', 'Pokrovskoe Streshnevo', 'Prospekt Vernadskogo', 'Birjulevo Zapadnoe', 'Lianozovo', 'Poselenie Voronovskoe', 'Nekrasovka', 'Alekseevskoe', 'Poselenie Mihajlovo-Jarcevskoe', 'Hamovniki', 'Metrogorodok', 'Severnoe Tushino', 'Jaroslavskoe', 'Tverskoe', 'Troparevo-Nikulino', 'Poselenie Filimonkovskoe', 'Arbat', 'Taganskoe', "Mar'ina Roshha", 'Horoshevskoe', 'Losinoostrovskoe', "Krasnosel'skoe", 'Matushkino', 'Savelovskoe', 'Caricyno', 'Timirjazevskoe', "Tekstil'shhiki", 'Dorogomilovo', "Gol'janovo", 'Brateevo', 'Rostokino', 'Severnoe Butovo', 'Veshnjaki', 'Basmannoe', 'Mitino', 'Vostochnoe Izmajlovo', 'Novokosino', 'Vostochnoe', 'Strogino', 'Silino', 'Jasenevo', 'Gagarinskoe', 'Novogireevo', 'Akademicheskoe', 'Levoberezhnoe', 'Kapotnja', 'Poselenie Marushkinskoe'}

ID metro
Number of categories: 212
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 161, 162, 163, 164, 165, 166, 167, 168, 170, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 193, 194, 195, 196, 197, 199, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 215, 216, 219, 220, 221, 222, 224}

office raion
Number of categories: 30
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 141, 14, 16, 19, 20, 23, 24, 27, 37, 39, 45, 48, 56, 59, 73, 84, 87, 93}

sport objects raion
Number of categories: 24
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 25, 29}

raion popul
Number of categories: 145
{90114, 116742, 6161, 28179, 76308, 80917, 68630, 53786, 94236, 41504, 71715, 8227, 12327, 83502, 21040, 77878, 139322, 118843, 81980, 57405, 13890, 78418, 178264, 85083, 112221, 101982, 4199, 5740, 61039, 75377, 123000, 37502, 67710, 142462, 2693, 57995, 108171, 47245, 57999, 115352, 153248, 118945, 87713, 21155, 112804, 145576, 78507, 247469, 7341, 130229, 129207, 125111, 38075, 86206, 96959, 145088, 102590, 105663, 8384, 56535, 79576, 156377, 102618, 85721, 85219, 113897, 174831, 132349, 51455, 111874, 111374, 57107, 43795, 78616, 12061, 155427, 55590, 178473, 143661, 73007, 48439, 36154, 21819, 64317, 26943, 103746, 102726, 32071, 101708, 9553, 157010, 17236, 55125, 4949, 27992, 130396, 165727, 94561, 94564, 7538, 89971, 28537, 89467, 76156, 76670, 17790, 83844, 123280, 166803, 80791, 60315, 175518, 4001, 64931, 142243, 83369, 125354, 102828, 111023, 37807, 155572, 65972, 73148, 31167, 39873, 3521, 72131, 85956, 106445, 7122, 26578, 61396, 219609, 78810, 104410, 91100, 81887, 19940, 122862, 100846, 32241, 2546, 104434, 122873, 76284}

healthcare centers raion
Number of categories: 7
{0, 1, 2, 3, 4, 5, 6}

school education centers raion
Number of categories: 14
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14}

preschool education centers raion
Number of categories: 13
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13}
In [34]:
for feature in X_list_cat:
    for element in list(set(df_test_cat[feature])):
        if element not in list(set(df_train_cat[feature])): 
            print (feature, element)
ID_metro 224
In [35]:
ID_metro_cat = pd.factorize(df_train_cat['ID_metro'])
df_train_cat['ID_metro'] = ID_metro_cat[0]

ID_metro_pairs = dict(zip(list(ID_metro_cat[1]), list(set(ID_metro_cat[0]))))
ID_metro_pairs[224] = 219

df_test_cat['ID_metro'].replace(ID_metro_pairs,inplace=True)
In [36]:
for feature in X_list_cat:
    if feature !='ID_metro':
        feature_cat = pd.factorize(df_train_cat[feature])
        df_train_cat[feature] = feature_cat[0]
        feature_pairs = dict(zip(list(feature_cat[1]), list(set(feature_cat[0]))))
        df_test_cat[feature].replace(feature_pairs,inplace=True)
In [37]:
for df in [df_train_cat, df_test_cat]:
    print ("____________________________________________")
    
    print('sub area')
    print('Number of categories:', len(set(df['sub_area'])))
    print(set(df['sub_area']))

    print('\nID metro')
    print('Number of categories:', len(set(df['ID_metro'])))
    print(set(df['ID_metro']))

    print('\noffice raion')
    print('Number of categories:', len(set(df['office_raion'])))
    print(set(df['office_raion']))

    print('\nsport objects raion')
    print('Number of categories:', len(set(df['sport_objects_raion'])))
    print(set(df_train_cat['sport_objects_raion']))

    print('\nraion popul')
    print('Number of categories:', len(set(df['raion_popul'])))
    print(set(df['raion_popul']))

    print('\nhealthcare centers raion')
    print('Number of categories:', len(set(df_train_cat['healthcare_centers_raion'])))
    print(set(df['healthcare_centers_raion']))

    print('\nschool education centers raion')
    print('Number of categories:', len(set(df['school_education_centers_raion'])))
    print(set(df['school_education_centers_raion']))

    print('\npreschool education centers raion')
    print('Number of categories:', len(set(df['preschool_education_centers_raion'])))
    print(set(df['preschool_education_centers_raion']))
____________________________________________
sub area
Number of categories: 146
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145}

ID metro
Number of categories: 219
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218}

office raion
Number of categories: 30
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}

sport objects raion
Number of categories: 24
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}

raion popul
Number of categories: 146
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145}

healthcare centers raion
Number of categories: 7
{0, 1, 2, 3, 4, 5, 6}

school education centers raion
Number of categories: 14
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}

preschool education centers raion
Number of categories: 13
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}
____________________________________________
sub area
Number of categories: 145
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 137, 138, 139, 140, 141, 142, 143, 144, 145}

ID metro
Number of categories: 212
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 180, 181, 182, 184, 185, 186, 187, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 206, 207, 208, 209, 210, 211, 212, 213, 215, 218, 219}

office raion
Number of categories: 30
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}

sport objects raion
Number of categories: 24
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}

raion popul
Number of categories: 145
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 137, 138, 139, 140, 141, 142, 143, 144, 145}

healthcare centers raion
Number of categories: 7
{0, 1, 2, 3, 4, 5, 6}

school education centers raion
Number of categories: 14
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}

preschool education centers raion
Number of categories: 13
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}
In [41]:
df_train_cat1 = df_train_cat
encode = OneHotEncoder(sparse=False)

for column in X_list_cat:
    encode.fit(df_train_cat[[column]])
    transform = encode.transform(df_train_cat[[column]])
    
    transform = pd.DataFrame(transform, 
                             columns=[(column+"_"+str(i)) for i in df_train_cat[column].value_counts().index])
    transform = transform.set_index(df_train_cat.index.values)
    
    df_train_cat1 = pd.concat([df_train_cat1, transform], axis=1)
    df_train_cat1 = df_train_cat1.drop(column, 1)
In [39]:
df_test_cat1 = df_test_cat
encode = OneHotEncoder(sparse=False)

for column in X_list_cat:
    encode.fit(df_test_cat[[column]])
    transform = encode.transform(df_test_cat[[column]])
    
    transform = pd.DataFrame(transform, 
                             columns=[(column+"_"+str(i)) for i in df_test_cat[column].value_counts().index])
    transform = transform.set_index(df_test_cat.index.values)
    
    df_test_cat1 = pd.concat([df_test_cat1, transform], axis=1)
    df_test_cat1 = df_test_cat1.drop(column, 1)
In [42]:
# Check Encoding
df_train_cat1.iloc[:, 623:636][:3].as_matrix()
Out[42]:
array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])
In [43]:
df_train_cat['preschool_education_centers_raion'][:3]
Out[43]:
7672    0
8056    1
8111    2
Name: preschool_education_centers_raion, dtype: int64
In [44]:
print('Shape of the train data frame:', df_train_cat1.shape)
print('Shape of the test data frame:', df_test_cat1.shape)
Shape of the train data frame: (20899, 636)
Shape of the test data frame: (7662, 626)
In [45]:
print("Features in the train data, but not in the test data:")
for element in list(df_train_cat1):
    if element not in list(df_test_cat1):
        print(element)
Features in the train data, but not in the test data:
prices
sub_area_136
ID_metro_188
ID_metro_205
ID_metro_216
ID_metro_214
ID_metro_183
ID_metro_179
ID_metro_153
ID_metro_217
raion_popul_136
In [46]:
print("Features in the test data, but not in the train data:")
for element in list(df_test_cat1):
    if element not in list(df_train_cat1):
        print(element)
Features in the test data, but not in the train data:
ID_metro_219
In [47]:
for column in ['sub_area_136',' ID_metro_188', 'ID_metro_205', 'ID_metro_216', 'ID_metro_214',
              'ID_metro_183',' ID_metro_179', 'ID_metro_153', 'ID_metro_217', 'raion_popul_136']:
    df_test_cat1[column] = 0
    
df_train_cat1['ID_metro_219'] = 0

print('Columns with zero values were added.\n')
print('Shape of the train data frame:', df_train_cat1.shape)
print('Shape of the test data frame:', df_test_cat1.shape)
Columns with zero values were added.

Shape of the train data frame: (20899, 637)
Shape of the test data frame: (7662, 636)

3.4 Displaying correlation

In [48]:
pearson = df_train.corr(method='pearson')
corr_with_prices = pearson.ix[-1][:-1]
corr_with_prices[abs(corr_with_prices).argsort()[::-1]]
Out[48]:
full_sq                              0.593829
num_room                             0.476337
kremlin_km                          -0.290126
stadium_km                          -0.238431
detention_facility_km               -0.233395
university_km                       -0.222964
theater_km                          -0.222873
workplaces_km                       -0.220889
swim_pool_km                        -0.220480
exhibition_km                       -0.212144
radiation_km                        -0.208256
museum_km                           -0.203846
park_km                             -0.201636
metro_min_walk                      -0.200058
fitness_km                          -0.197702
metro_km_avto                       -0.194751
shopping_centers_km                 -0.182459
public_healthcare_km                -0.182388
big_road2_km                        -0.178865
bus_terminal_avto_km                -0.176601
ekder_all                            0.169331
area_m                              -0.167851
school_km                           -0.158775
preschool_km                        -0.157079
additional_education_km             -0.146074
kindergarten_km                     -0.141627
work_all                             0.136761
railroad_station_walk_min           -0.135099
oil_chemistry_km                    -0.134873
railroad_station_avto_km            -0.132209
young_all                            0.131324
public_transport_station_min_walk   -0.128647
big_road1_km                        -0.098968
usdrub                               0.069506
big_market_km                       -0.069257
cemetery_km                         -0.042413
Name: prices, dtype: float64
In [50]:
features_list2 = corr_with_prices[abs(corr_with_prices).argsort()[::-1]][:32].index.values.tolist()
print(features_list2)
['full_sq', 'num_room', 'kremlin_km', 'stadium_km', 'detention_facility_km', 'university_km', 'theater_km', 'workplaces_km', 'swim_pool_km', 'exhibition_km', 'radiation_km', 'museum_km', 'park_km', 'metro_min_walk', 'fitness_km', 'metro_km_avto', 'shopping_centers_km', 'public_healthcare_km', 'big_road2_km', 'bus_terminal_avto_km', 'ekder_all', 'area_m', 'school_km', 'preschool_km', 'additional_education_km', 'kindergarten_km', 'work_all', 'railroad_station_walk_min', 'oil_chemistry_km', 'railroad_station_avto_km', 'young_all', 'public_transport_station_min_walk']

3.5 Scale, Shuffle and Split the Data

In [51]:
target_train = df_train['prices'].as_matrix()

features_train = df_train.drop('prices', 1).as_matrix()
features_test = df_test.as_matrix()

features_train_cat = df_train_cat.drop('prices', 1).as_matrix()
features_test_cat = df_test_cat.as_matrix()

features_train_cat_enc = df_train_cat1.drop('prices', 1).as_matrix()
features_test_cat_enc = df_test_cat1.as_matrix()
In [52]:
print('Numeric Features')
X_train, X_test, y_train, y_test = \
train_test_split(features_train, target_train, test_size = 0.2, random_state = 1)
X_train.shape, X_test.shape
Numeric Features
Out[52]:
((16719, 36), (4180, 36))
In [53]:
print('Numeric and Categorical Features')
X_train_cat, X_test_cat, y_train_cat, y_test_cat = \
train_test_split(features_train_cat, target_train, test_size = 0.2, random_state = 1)
X_train_cat.shape, X_test_cat.shape
Numeric and Categorical Features
Out[53]:
((16719, 44), (4180, 44))
In [54]:
print('Numeric and Encoded Categorical Features')
X_train_cat_enc, X_test_cat_enc, y_train_cat_enc, y_test_cat_enc = \
train_test_split(features_train_cat_enc, target_train, test_size = 0.2, random_state = 1)
X_train_cat_enc.shape, X_test_cat_enc.shape
Numeric and Encoded Categorical Features
Out[54]:
((16719, 636), (4180, 636))
In [55]:
scale_X = RobustScaler()
X_train = scale_X.fit_transform(X_train)
X_test = scale_X.transform(X_test)

scale_y = RobustScaler()
y_train = scale_y.fit_transform(y_train.reshape(-1,1))
y_test = scale_y.transform(y_test.reshape(-1,1))

scale_X_cat = RobustScaler()
X_train_cat = scale_X_cat.fit_transform(X_train_cat)
X_test_cat = scale_X_cat.transform(X_test_cat)

scale_y_cat = RobustScaler()
y_train_cat = scale_y_cat.fit_transform(y_train_cat.reshape(-1,1))
y_test_cat = scale_y_cat.transform(y_test_cat.reshape(-1,1))

scale_X_cat_enc = RobustScaler()
X_train_cat_enc = scale_X_cat_enc.fit_transform(X_train_cat_enc)
X_test_cat_enc = scale_X_cat_enc.transform(X_test_cat_enc)

scale_y_cat_enc = RobustScaler()
y_train_cat_enc = scale_y_cat_enc.fit_transform(y_train_cat_enc.reshape(-1,1))
y_test_cat_enc = scale_y_cat_enc.transform(y_test_cat_enc.reshape(-1,1))

4. Benchmark Models

4.1 Regressors; Scikit-Learn

Tuning Parameters

In [58]:
print('Numeric Features')
print ('Gradient Boosting Regressor')
param_grid_gbr = {'max_depth': [3, 4, 5], 'n_estimators': range(36, 361, 36)}
gridsearch_gbr = GridSearchCV(GradientBoostingRegressor(), 
                              param_grid_gbr, n_jobs=5).fit(X_train, y_train)

gridsearch_gbr.best_params_
Numeric Features
Gradient Boosting Regressor
Out[58]:
{'max_depth': 4, 'n_estimators': 360}
In [ ]:
print ('Bagging Regressor')
param_grid_br = {'n_estimators': range(36, 361, 36)}
gridsearch_br = GridSearchCV(BaggingRegressor(), 
                             param_grid_br, n_jobs=5).fit(X_train, y_train)

gridsearch_br.best_params_
In [ ]:
print('Numeric and Categorical Features')
print ('Gradient Boosting Regressor')
param_grid_gbr_cat = {'max_depth': [3, 4, 5], 'n_estimators': range(44, 441, 44)}
gridsearch_gbr_cat = GridSearchCV(GradientBoostingRegressor(), 
                                  param_grid_gbr_cat, n_jobs=5).fit(X_train_cat, y_train_cat)

gridsearch_gbr_cat.best_params_
In [ ]:
print ('Bagging Regressor')
param_grid_br_cat = {'n_estimators': range(44, 441, 44)}
gridsearch_br_cat = GridSearchCV(BaggingRegressor(), 
                                 param_grid_br_cat, n_jobs=5).fit(X_train_cat, y_train_cat)

gridsearch_br_cat.best_params_
In [ ]:
print('Numeric and Encoded Categorical Features')
print ('Gradient Boosting Regressor')
param_grid_gbr_cat_enc = {'max_depth': [3, 4, 5], 'n_estimators': [159, 318, 636]}
gridsearch_gbr_cat_enc = GridSearchCV(GradientBoostingRegressor(), 
                                      param_grid_gbr_cat_enc, 
                                      n_jobs=5).fit(X_train_cat_enc, y_train_cat_enc)

gridsearch_gbr_cat_enc.best_params_
In [ ]:
print ('Bagging Regressor')
param_grid_br_cat_enc = {'n_estimators': [159, 318, 636]}
gridsearch_br_cat_enc = GridSearchCV(BaggingRegressor(), 
                                     param_grid_br_cat_enc, 
                                     n_jobs=5).fit(X_train_cat_enc, y_train_cat_enc)

gridsearch_br_cat_enc.best_params_

Fit the Regressors

In [59]:
print('Numeric Features')
y_train_gbr, y_test_gbr = regression(GradientBoostingRegressor(max_depth=4, n_estimators=360), 
                                     X_train, X_test, y_train)

y_train_br, y_test_br = regression(BaggingRegressor(n_estimators=252), 
                                   X_train, X_test, y_train)

scores('GradientBoostingRegressor', y_train, y_test, y_train_gbr, y_test_gbr)
scores('BaggingRegressor', y_train, y_test, y_train_br, y_test_br)
Numeric Features
_______________________________________
GradientBoostingRegressor
_______________________________________
EV score. Train:  0.86189746402
EV score. Test:  0.718508531221
---------
R2 score. Train:  0.86189746402
R2 score. Test:  0.718420278233
---------
MSE score. Train:  0.251150449123
MSE score. Test:  0.563413959388
---------
MAE score. Train:  0.31458911313
MAE score. Test:  0.401407961457
---------
MdAE score. Train:  0.174402117839
MdAE score. Test:  0.20046676225
_______________________________________
BaggingRegressor
_______________________________________
EV score. Train:  0.955106181613
EV score. Test:  0.722960538307
---------
R2 score. Train:  0.955079582338
R2 score. Test:  0.72258768868
---------
MSE score. Train:  0.0816913533877
MSE score. Test:  0.555075371631
---------
MAE score. Train:  0.147493252721
MAE score. Test:  0.390699976309
---------
MdAE score. Train:  0.0642587643561
MdAE score. Test:  0.176114624118
In [60]:
print('Numeric and Categorical Features')
y_train_cat_gbr, y_test_cat_gbr = \
regression(GradientBoostingRegressor(max_depth=3, n_estimators=396), X_train_cat, X_test_cat, y_train_cat)

y_train_cat_br, y_test_cat_br = \
regression(BaggingRegressor(n_estimators=220), X_train_cat, X_test_cat, y_train_cat)

scores('GradientBoostingRegressor', 
       y_train_cat, y_test_cat, y_train_cat_gbr, y_test_cat_gbr)
scores('BaggingRegressor', 
       y_train_cat, y_test_cat, y_train_cat_br, y_test_cat_br)
Numeric and Categorical Features
_______________________________________
GradientBoostingRegressor
_______________________________________
EV score. Train:  0.819256487057
EV score. Test:  0.71645861476
---------
R2 score. Train:  0.819256487057
R2 score. Test:  0.716379131697
---------
MSE score. Train:  0.328696458248
MSE score. Test:  0.567498097423
---------
MAE score. Train:  0.352419590753
MAE score. Test:  0.407158493333
---------
MdAE score. Train:  0.190394737254
MdAE score. Test:  0.204270027971
_______________________________________
BaggingRegressor
_______________________________________
EV score. Train:  0.956343639183
EV score. Test:  0.718511684563
---------
R2 score. Train:  0.956317154125
R2 score. Test:  0.718073805467
---------
MSE score. Train:  0.0794407306331
MSE score. Test:  0.564107218092
---------
MAE score. Train:  0.147531935307
MAE score. Test:  0.39393306598
---------
MdAE score. Train:  0.063952020202
MdAE score. Test:  0.177101090699
In [61]:
print('Numeric and Encoded Categorical Features')
y_train_cat_enc_gbr, y_test_cat_enc_gbr = \
regression(GradientBoostingRegressor(max_depth=3, n_estimators=159), 
           X_train_cat_enc, X_test_cat_enc, y_train_cat_enc)

y_train_cat_enc_br, y_test_cat_enc_br = \
regression(BaggingRegressor(n_estimators=159), 
           X_train_cat_enc, X_test_cat_enc, y_train_cat_enc)

scores('GradientBoostingRegressor', 
       y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_gbr, y_test_cat_enc_gbr)
scores('BaggingRegressor', 
       y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_br, y_test_cat_enc_br)
Numeric and Encoded Categorical Features
_______________________________________
GradientBoostingRegressor
_______________________________________
EV score. Train:  0.769881646987
EV score. Test:  0.7088634031
---------
R2 score. Train:  0.769881646987
R2 score. Test:  0.708819668777
---------
MSE score. Train:  0.418488533179
MSE score. Test:  0.582623856153
---------
MAE score. Train:  0.390531067732
MAE score. Test:  0.419840535779
---------
MdAE score. Train:  0.205749300262
MdAE score. Test:  0.217442154308
_______________________________________
BaggingRegressor
_______________________________________
EV score. Train:  0.955423070496
EV score. Test:  0.713838756665
---------
R2 score. Train:  0.955402817232
R2 score. Test:  0.713429887988
---------
MSE score. Train:  0.0811035250176
MSE score. Test:  0.573399250619
---------
MAE score. Train:  0.147635780389
MAE score. Test:  0.394442152896
---------
MdAE score. Train:  0.0638861547869
MdAE score. Test:  0.177345781796

MLP Regressors

In [62]:
mlpr = MLPRegressor(hidden_layer_sizes=(324,), max_iter=200, 
                    solver='lbfgs', alpha=0.01)
mlpr.fit(X_train, y_train)

y_train_mlpr = mlpr.predict(X_train)
y_test_mlpr = mlpr.predict(X_test)

scores('MLP Regressor; Numeric Features', 
       y_train, y_test, y_train_mlpr, y_test_mlpr)
_______________________________________
MLP Regressor; Numeric Features
_______________________________________
EV score. Train:  0.706889039064
EV score. Test:  0.688762277467
---------
R2 score. Train:  0.706820223315
R2 score. Test:  0.688548277267
---------
MSE score. Train:  0.533170749296
MSE score. Test:  0.62318496219
---------
MAE score. Train:  0.418133978675
MAE score. Test:  0.437912531306
---------
MdAE score. Train:  0.228135248198
MdAE score. Test:  0.238360223016
In [63]:
mlpr_cat = MLPRegressor(hidden_layer_sizes=(396,), max_iter=200, 
                        solver='lbfgs', alpha=0.01)
mlpr_cat.fit(X_train_cat, y_train_cat)

y_train_cat_mlpr = mlpr_cat.predict(X_train_cat)
y_test_cat_mlpr = mlpr_cat.predict(X_test_cat)

scores('MLP Regressor; Numeric and Categorical Features', 
       y_train_cat, y_test_cat, y_train_cat_mlpr, y_test_cat_mlpr)
_______________________________________
MLP Regressor; Numeric and Categorical Features
_______________________________________
EV score. Train:  0.721571506386
EV score. Test:  0.683064970679
---------
R2 score. Train:  0.721571359022
R2 score. Test:  0.682956895884
---------
MSE score. Train:  0.506344635412
MSE score. Test:  0.634372778926
---------
MAE score. Train:  0.411185894824
MAE score. Test:  0.444609550542
---------
MdAE score. Train:  0.224697762204
MdAE score. Test:  0.24287476303
In [64]:
mlpr_cat_enc = MLPRegressor(hidden_layer_sizes=(318,), max_iter=200, 
                            solver='lbfgs', alpha=0.01)
mlpr_cat_enc.fit(X_train_cat_enc, y_train_cat_enc)

y_train_cat_enc_mlpr = mlpr_cat_enc.predict(X_train_cat_enc)
y_test_cat_enc_mlpr = mlpr_cat_enc.predict(X_test_cat_enc)

scores('MLP Regressor; Numeric and Encoded Categorical Features', 
       y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_mlpr, y_test_cat_enc_mlpr)
_______________________________________
MLP Regressor; Numeric and Encoded Categorical Features
_______________________________________
EV score. Train:  0.758762806619
EV score. Test:  0.689873127397
---------
R2 score. Train:  0.758738012931
R2 score. Test:  0.689787708666
---------
MSE score. Train:  0.438754118298
MSE score. Test:  0.620704979089
---------
MAE score. Train:  0.396668717191
MAE score. Test:  0.440850467304
---------
MdAE score. Train:  0.217031245932
MdAE score. Test:  0.236478208425

Display Predictions

In [65]:
plt.figure(figsize = (18, 6))

plt.plot(y_test[1:50], color = 'black', label='Real Data')

plt.plot(y_test_gbr[1:50], label='Gradient Boosting')
plt.plot(y_test_br[1:50], label='Bagging Regressor')
plt.plot(y_test_mlpr[1:50], label='MLP Regressor')

plt.legend()
plt.title("Numeric Features; Regressor Predictions vs Real Data");
In [66]:
plt.figure(figsize = (18, 6))

plt.plot(y_test_cat[1:50], color = 'black', label='Real Data')

plt.plot(y_test_cat_gbr[1:50], label='Gradient Boosting')
plt.plot(y_test_cat_br[1:50], label='Bagging Regressor')
plt.plot(y_test_cat_mlpr[1:50], label='MLP Regressor')

plt.legend()
plt.title("Numeric and Categorical Features; Regressor Predictions vs Real Data");
In [67]:
plt.figure(figsize = (18, 6))

plt.plot(y_test_cat_enc[1:50], color = 'black', label='Real Data')

plt.plot(y_test_cat_enc_gbr[1:50], label='Gradient Boosting')
plt.plot(y_test_cat_enc_br[1:50], label='Bagging Regressor')
plt.plot(y_test_cat_enc_mlpr[1:50], label='MLP Regressor')

plt.legend()
plt.title("Numeric and Encoded Categorical Features; Regressor Predictions vs Real Data");

4.2 Neural Networks

MLP

In [142]:
def mlp_model():
    model = Sequential()
    
    model.add(Dense(1152, activation='relu', input_dim=36))   
    model.add(Dense(288, activation='relu'))    
    model.add(Dense(72, activation='relu'))
    
    model.add(Dense(1))
    
    model.compile(loss='mse', optimizer='adam', metrics=['mae'])
    return model
In [143]:
mlp_model = mlp_model()
mlp_checkpointer = ModelCheckpoint(filepath='weights.best.mlp_reg.sberbank.hdf5', 
                                   verbose=2, save_best_only=True)

mlp_history = mlp_model.fit(X_train, y_train, validation_data=(X_test, y_test),
                            nb_epoch=15, batch_size=128, verbose=0, callbacks=[mlp_checkpointer])
Epoch 00000: val_loss improved from inf to 0.70540, saving model to weights.best.mlp_reg.sberbank.hdf5
Epoch 00001: val_loss improved from 0.70540 to 0.65839, saving model to weights.best.mlp_reg.sberbank.hdf5
Epoch 00002: val_loss improved from 0.65839 to 0.62783, saving model to weights.best.mlp_reg.sberbank.hdf5
Epoch 00003: val_loss did not improve
Epoch 00004: val_loss did not improve
Epoch 00005: val_loss did not improve
Epoch 00006: val_loss improved from 0.62783 to 0.60627, saving model to weights.best.mlp_reg.sberbank.hdf5
Epoch 00007: val_loss did not improve
Epoch 00008: val_loss did not improve
Epoch 00009: val_loss did not improve
Epoch 00010: val_loss did not improve
Epoch 00011: val_loss did not improve
Epoch 00012: val_loss did not improve
Epoch 00013: val_loss did not improve
Epoch 00014: val_loss did not improve
In [144]:
loss_plot(mlp_history)
mae_plot(mlp_history)
In [145]:
mlp_model.load_weights('weights.best.mlp_reg.sberbank.hdf5')

y_train_mlp = mlp_model.predict(X_train)
y_test_mlp = mlp_model.predict(X_test)

scores('MLP Model; Numeric Features', y_train, y_test, y_train_mlp, y_test_mlp)
_______________________________________
MLP Model; Numeric Features
_______________________________________
EV score. Train:  0.741240088393
EV score. Test:  0.697208416483
---------
R2 score. Train:  0.741111890994
R2 score. Test:  0.697001567716
---------
MSE score. Train:  0.470808623376
MSE score. Test:  0.606270740485
---------
MAE score. Train:  0.404431734854
MAE score. Test:  0.428410827808
---------
MdAE score. Train:  0.209477743043
MdAE score. Test:  0.217244909339
In [146]:
mlp_model.save('kaggle_sberbank_mlp_reg_model.h5')
In [148]:
def mlp_cat_model():
    model = Sequential()
    
    model.add(Dense(88*16, activation='relu', input_dim=44))   
    model.add(Dense(88*4, activation='relu'))    
    model.add(Dense(88, activation='relu'))
    
    model.add(Dense(1))
    
    model.compile(loss='mse', optimizer='adam', metrics=['mae'])
    return model
In [149]:
mlp_cat_model = mlp_cat_model()
mlp_cat_checkpointer = ModelCheckpoint(filepath='weights.best.mlp_cat_reg.sberbank.hdf5', 
                                       verbose=2, save_best_only=True)
mlp_cat_history = mlp_cat_model.fit(X_train_cat, y_train_cat, 
                                    validation_data=(X_test_cat, y_test_cat),
                                    nb_epoch=10, batch_size=128, verbose=0, callbacks=[mlp_cat_checkpointer])
Epoch 00000: val_loss improved from inf to 0.65970, saving model to weights.best.mlp_cat_reg.sberbank.hdf5
Epoch 00001: val_loss did not improve
Epoch 00002: val_loss did not improve
Epoch 00003: val_loss did not improve
Epoch 00004: val_loss improved from 0.65970 to 0.64135, saving model to weights.best.mlp_cat_reg.sberbank.hdf5
Epoch 00005: val_loss improved from 0.64135 to 0.62193, saving model to weights.best.mlp_cat_reg.sberbank.hdf5
Epoch 00006: val_loss did not improve
Epoch 00007: val_loss did not improve
Epoch 00008: val_loss did not improve
Epoch 00009: val_loss did not improve
In [150]:
loss_plot(mlp_cat_history)
mae_plot(mlp_cat_history)
In [151]:
mlp_cat_model.load_weights('weights.best.mlp_cat_reg.sberbank.hdf5')
y_train_cat_mlp = mlp_cat_model.predict(X_train_cat)
y_test_cat_mlp = mlp_cat_model.predict(X_test_cat)

scores('MLP Model; Numeric and Categorical Features', 
       y_train_cat, y_test_cat, y_train_cat_mlp, y_test_cat_mlp)
_______________________________________
MLP Model; Numeric and Categorical Features
_______________________________________
EV score. Train:  0.762544350768
EV score. Test:  0.690040203052
---------
R2 score. Train:  0.762047043589
R2 score. Test:  0.689174982953
---------
MSE score. Train:  0.432736382781
MSE score. Test:  0.621930984348
---------
MAE score. Train:  0.399402395193
MAE score. Test:  0.43528131985
---------
MdAE score. Train:  0.213113170001
MdAE score. Test:  0.224732577172
In [152]:
mlp_cat_model.save('kaggle_sberbank_mlp_cat_reg_model.h5')
In [159]:
def mlp_cat_enc_model():
    model = Sequential()
    
    model.add(Dense(636*16, activation='relu', input_dim=636))   
    model.add(Dense(636*4, activation='relu'))    
    model.add(Dense(636, activation='relu'))
    
    model.add(Dense(1))
    
    model.compile(loss='mse', optimizer='adam', metrics=['mae'])
    return model
In [160]:
mlp_cat_enc_model = mlp_cat_enc_model()
mlp_cat_enc_checkpointer = ModelCheckpoint(filepath='weights.best.mlp_cat_enc_reg.sberbank.hdf5', 
                                           verbose=2, save_best_only=True)
mlp_cat_enc_history = mlp_cat_enc_model.fit(X_train_cat_enc, y_train_cat_enc, 
                                            validation_data=(X_test_cat_enc, y_test_cat_enc),
                                            nb_epoch=10, batch_size=128, verbose=0, 
                                            callbacks=[mlp_cat_enc_checkpointer])
Epoch 00000: val_loss improved from inf to 0.91033, saving model to weights.best.mlp_cat_enc_reg.sberbank.hdf5
Epoch 00001: val_loss improved from 0.91033 to 0.72600, saving model to weights.best.mlp_cat_enc_reg.sberbank.hdf5
Epoch 00002: val_loss improved from 0.72600 to 0.63536, saving model to weights.best.mlp_cat_enc_reg.sberbank.hdf5
Epoch 00003: val_loss did not improve
Epoch 00004: val_loss did not improve
Epoch 00005: val_loss did not improve
Epoch 00006: val_loss did not improve
Epoch 00007: val_loss did not improve
Epoch 00008: val_loss did not improve
Epoch 00009: val_loss did not improve
In [161]:
loss_plot(mlp_cat_enc_history)
mae_plot(mlp_cat_enc_history)
In [162]:
mlp_cat_enc_model.load_weights('weights.best.mlp_cat_enc_reg.sberbank.hdf5')
y_train_cat_enc_mlp = mlp_cat_enc_model.predict(X_train_cat_enc)
y_test_cat_enc_mlp = mlp_cat_enc_model.predict(X_test_cat_enc)

scores('MLP Model; Numeric and Encoded Categorical Features', 
       y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_mlp, y_test_cat_enc_mlp)
_______________________________________
MLP Model; Numeric and Encoded Categorical Features
_______________________________________
EV score. Train:  0.752346869229
EV score. Test:  0.685095663618
---------
R2 score. Train:  0.750725801504
R2 score. Test:  0.682465973447
---------
MSE score. Train:  0.45332496223
MSE score. Test:  0.635355067538
---------
MAE score. Train:  0.406728727492
MAE score. Test:  0.448880427225
---------
MdAE score. Train:  0.218067453437
MdAE score. Test:  0.233627818745
In [163]:
mlp_cat_enc_model.save('kaggle_sberbank_mlp_cat_enc_reg_model.h5')

CNN

In [165]:
def cnn_model():
    model = Sequential()
        
    model.add(Conv1D(36, 5, padding='valid', activation='relu', input_shape=(36, 1)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))

    model.add(Conv1D(144, 3, padding='valid', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))
    
    model.add(Flatten())

    model.add(Dense(576, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.5))

    model.add(Dense(1, kernel_initializer='normal'))
   
    model.compile(loss='mse', optimizer='adam', metrics=['mae'])
    return model
In [166]:
cnn_model = cnn_model()
cnn_checkpointer = ModelCheckpoint(filepath='weights.best.cnn_reg.sberbank.hdf5', 
                                   verbose=2, save_best_only=True)
cnn_history = cnn_model.fit(X_train.reshape(16719, 36, 1), y_train, 
                            epochs=20, batch_size=128, verbose=0, callbacks=[cnn_checkpointer],
                            validation_data=(X_test.reshape(4180, 36, 1), y_test))
Epoch 00000: val_loss improved from inf to 0.91717, saving model to weights.best.cnn_reg.sberbank.hdf5
Epoch 00001: val_loss improved from 0.91717 to 0.79339, saving model to weights.best.cnn_reg.sberbank.hdf5
Epoch 00002: val_loss improved from 0.79339 to 0.75740, saving model to weights.best.cnn_reg.sberbank.hdf5
Epoch 00003: val_loss improved from 0.75740 to 0.72979, saving model to weights.best.cnn_reg.sberbank.hdf5
Epoch 00004: val_loss did not improve
Epoch 00005: val_loss did not improve
Epoch 00006: val_loss improved from 0.72979 to 0.67896, saving model to weights.best.cnn_reg.sberbank.hdf5
Epoch 00007: val_loss improved from 0.67896 to 0.67520, saving model to weights.best.cnn_reg.sberbank.hdf5
Epoch 00008: val_loss improved from 0.67520 to 0.66100, saving model to weights.best.cnn_reg.sberbank.hdf5
Epoch 00009: val_loss improved from 0.66100 to 0.63682, saving model to weights.best.cnn_reg.sberbank.hdf5
Epoch 00010: val_loss did not improve
Epoch 00011: val_loss did not improve
Epoch 00012: val_loss did not improve
Epoch 00013: val_loss improved from 0.63682 to 0.62333, saving model to weights.best.cnn_reg.sberbank.hdf5
Epoch 00014: val_loss improved from 0.62333 to 0.60545, saving model to weights.best.cnn_reg.sberbank.hdf5
Epoch 00015: val_loss did not improve
Epoch 00016: val_loss did not improve
Epoch 00017: val_loss did not improve
Epoch 00018: val_loss did not improve
Epoch 00019: val_loss did not improve
In [167]:
loss_plot(cnn_history)
mae_plot(cnn_history)
In [168]:
cnn_model.load_weights('weights.best.cnn_reg.sberbank.hdf5')
y_train_cnn = cnn_model.predict(X_train.reshape(16719, 36, 1))
y_test_cnn = cnn_model.predict(X_test.reshape(4180, 36, 1))

scores('CNN Model; Numeric Features', y_train, y_test, y_train_cnn, y_test_cnn)
_______________________________________
CNN Model; Numeric Features
_______________________________________
EV score. Train:  0.691333898827
EV score. Test:  0.697516941589
---------
R2 score. Train:  0.691178004766
R2 score. Test:  0.697411215951
---------
MSE score. Train:  0.561617368224
MSE score. Test:  0.605451073741
---------
MAE score. Train:  0.435884981358
MAE score. Test:  0.44552032365
---------
MdAE score. Train:  0.232059417089
MdAE score. Test:  0.232690975796
In [169]:
cnn_model.save('kaggle_sberbank_cnn_reg_model.h5')
In [170]:
def cnn_cat_model():
    model = Sequential()
        
    model.add(Conv1D(44, 5, padding='valid', activation='relu', input_shape=(44, 1)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))

    model.add(Conv1D(156, 3, padding='valid', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))
    
    model.add(Flatten())

    model.add(Dense(624, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.5))

    model.add(Dense(1, kernel_initializer='normal'))
    
    model.compile(loss='mse', optimizer='adam', metrics=['mae'])
    return model
In [171]:
cnn_cat_model = cnn_cat_model()
cnn_cat_checkpointer = ModelCheckpoint(filepath='weights.best.cnn_cat_reg.sberbank.hdf5', 
                                       verbose=2, save_best_only=True)

cnn_cat_history = cnn_cat_model.fit(X_train_cat.reshape(16719, 44, 1), y_train_cat, 
                                    epochs=20, batch_size=128, verbose=0, callbacks=[cnn_cat_checkpointer],
                                    validation_data=(X_test_cat.reshape(4180, 44, 1), y_test_cat))
Epoch 00000: val_loss improved from inf to 0.99892, saving model to weights.best.cnn_cat_reg.sberbank.hdf5
Epoch 00001: val_loss improved from 0.99892 to 0.75766, saving model to weights.best.cnn_cat_reg.sberbank.hdf5
Epoch 00002: val_loss improved from 0.75766 to 0.72129, saving model to weights.best.cnn_cat_reg.sberbank.hdf5
Epoch 00003: val_loss improved from 0.72129 to 0.68788, saving model to weights.best.cnn_cat_reg.sberbank.hdf5
Epoch 00004: val_loss did not improve
Epoch 00005: val_loss improved from 0.68788 to 0.64008, saving model to weights.best.cnn_cat_reg.sberbank.hdf5
Epoch 00006: val_loss did not improve
Epoch 00007: val_loss did not improve
Epoch 00008: val_loss did not improve
Epoch 00009: val_loss did not improve
Epoch 00010: val_loss did not improve
Epoch 00011: val_loss improved from 0.64008 to 0.60354, saving model to weights.best.cnn_cat_reg.sberbank.hdf5
Epoch 00012: val_loss did not improve
Epoch 00013: val_loss did not improve
Epoch 00014: val_loss did not improve
Epoch 00015: val_loss did not improve
Epoch 00016: val_loss did not improve
Epoch 00017: val_loss improved from 0.60354 to 0.58372, saving model to weights.best.cnn_cat_reg.sberbank.hdf5
Epoch 00018: val_loss did not improve
Epoch 00019: val_loss did not improve
In [172]:
loss_plot(cnn_cat_history)
mae_plot(cnn_cat_history)
In [173]:
cnn_cat_model.load_weights('weights.best.cnn_cat_reg.sberbank.hdf5')
y_train_cat_cnn = cnn_cat_model.predict(X_train_cat.reshape(16719, 44, 1))
y_test_cat_cnn = cnn_cat_model.predict(X_test_cat.reshape(4180, 44, 1))

scores('CNN Model; Numeric and Categorical Features', 
       y_train_cat, y_test_cat, y_train_cat_cnn, y_test_cat_cnn)
_______________________________________
CNN Model; Numeric and Categorical Features
_______________________________________
EV score. Train:  0.723615037825
EV score. Test:  0.708272674001
---------
R2 score. Train:  0.723602313404
R2 score. Test:  0.708272596267
---------
MSE score. Train:  0.502651183286
MSE score. Test:  0.583718495662
---------
MAE score. Train:  0.421356915094
MAE score. Test:  0.442774879551
---------
MdAE score. Train:  0.227151260906
MdAE score. Test:  0.237504174444
In [174]:
cnn_cat_model.save('kaggle_sberbank_cnn_cat_reg_model.h5')
In [175]:
def cnn_cat_enc_model():
    model = Sequential()
        
    model.add(Conv1D(159, 5, padding='valid', activation='relu', input_shape=(636, 1)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))

    model.add(Conv1D(318, 3, padding='valid', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))
    
    model.add(Flatten())

    model.add(Dense(636, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.5))

    model.add(Dense(1, kernel_initializer='normal'))
    
    model.compile(loss='mse', optimizer='adam', metrics=['mae'])
    return model
In [176]:
cnn_cat_enc_model = cnn_cat_enc_model()
cnn_cat_enc_checkpointer = ModelCheckpoint(filepath='weights.best.cnn_cat_enc_reg.sberbank.hdf5', 
                                           verbose=2, save_best_only=True)

cnn_cat_enc_history = \
cnn_cat_enc_model.fit(X_train_cat_enc.reshape(16719, 636, 1), y_train_cat_enc, 
                      epochs=10, batch_size=128, verbose=2, callbacks=[cnn_cat_enc_checkpointer],
                      validation_data=(X_test_cat_enc.reshape(4180, 636, 1), y_test_cat_enc))
Train on 16719 samples, validate on 4180 samples
Epoch 1/10
Epoch 00000: val_loss improved from inf to 0.83689, saving model to weights.best.cnn_cat_enc_reg.sberbank.hdf5
1163s - loss: 1.1313 - mean_absolute_error: 0.6357 - val_loss: 0.8369 - val_mean_absolute_error: 0.5397
Epoch 2/10
Epoch 00001: val_loss improved from 0.83689 to 0.71732, saving model to weights.best.cnn_cat_enc_reg.sberbank.hdf5
958s - loss: 0.7949 - mean_absolute_error: 0.5236 - val_loss: 0.7173 - val_mean_absolute_error: 0.4744
Epoch 3/10
Epoch 00002: val_loss did not improve
885s - loss: 0.7461 - mean_absolute_error: 0.5074 - val_loss: 0.7259 - val_mean_absolute_error: 0.4771
Epoch 4/10
Epoch 00003: val_loss did not improve
774s - loss: 0.7155 - mean_absolute_error: 0.4917 - val_loss: 0.8139 - val_mean_absolute_error: 0.5206
Epoch 5/10
Epoch 00004: val_loss improved from 0.71732 to 0.64979, saving model to weights.best.cnn_cat_enc_reg.sberbank.hdf5
782s - loss: 0.6817 - mean_absolute_error: 0.4861 - val_loss: 0.6498 - val_mean_absolute_error: 0.4800
Epoch 6/10
Epoch 00005: val_loss improved from 0.64979 to 0.64411, saving model to weights.best.cnn_cat_enc_reg.sberbank.hdf5
786s - loss: 0.6613 - mean_absolute_error: 0.4774 - val_loss: 0.6441 - val_mean_absolute_error: 0.4669
Epoch 7/10
Epoch 00006: val_loss did not improve
759s - loss: 0.6404 - mean_absolute_error: 0.4749 - val_loss: 0.6512 - val_mean_absolute_error: 0.4699
Epoch 8/10
Epoch 00007: val_loss did not improve
751s - loss: 0.6935 - mean_absolute_error: 0.4788 - val_loss: 0.6546 - val_mean_absolute_error: 0.4503
Epoch 9/10
Epoch 00008: val_loss improved from 0.64411 to 0.60557, saving model to weights.best.cnn_cat_enc_reg.sberbank.hdf5
782s - loss: 0.6278 - mean_absolute_error: 0.4684 - val_loss: 0.6056 - val_mean_absolute_error: 0.4474
Epoch 10/10
Epoch 00009: val_loss did not improve
749s - loss: 0.6018 - mean_absolute_error: 0.4685 - val_loss: 0.6917 - val_mean_absolute_error: 0.4790
In [177]:
loss_plot(cnn_cat_enc_history)
mae_plot(cnn_cat_enc_history)
In [178]:
cnn_cat_enc_model.load_weights('weights.best.cnn_cat_enc_reg.sberbank.hdf5')
y_train_cat_enc_cnn = cnn_cat_enc_model.predict(X_train_cat_enc.reshape(16719, 636, 1))
y_test_cat_enc_cnn = cnn_cat_enc_model.predict(X_test_cat_enc.reshape(4180, 636, 1))

scores('CNN Model; Numeric and Encoded Categorical Features', 
       y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_cnn, y_test_cat_enc_cnn)
_______________________________________
CNN Model; Numeric and Encoded Categorical Features
_______________________________________
EV score. Train:  0.718508934579
EV score. Test:  0.69771188854
---------
R2 score. Train:  0.718305811781
R2 score. Test:  0.697352342763
---------
MSE score. Train:  0.512283292877
MSE score. Test:  0.605568873331
---------
MAE score. Train:  0.425208330827
MAE score. Test:  0.447436328635
---------
MdAE score. Train:  0.231289068328
MdAE score. Test:  0.234809536397
In [179]:
cnn_cat_enc_model.save('kaggle_sberbank_cnn_cat_enc_reg_model.h5')

RNN

In [185]:
def rnn_model():
    model = Sequential()
    
    model.add(LSTM(144, return_sequences=True, input_shape=(1, 36)))
    model.add(LSTM(576, return_sequences=False))   
    
    model.add(Dense(1))

    model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])     
    return model
In [186]:
rnn_model = rnn_model()
rnn_checkpointer = ModelCheckpoint(filepath='weights.best.rnn_reg.sberbank.hdf5', 
                                   verbose=2, save_best_only=True)

rnn_history = rnn_model.fit(X_train.reshape(16719, 1, 36), y_train.reshape(16719), 
                            epochs=7, verbose=2, callbacks=[rnn_checkpointer],
                            validation_data=(X_test.reshape(4180, 1, 36), y_test.reshape(4180)))
Train on 16719 samples, validate on 4180 samples
Epoch 1/7
Epoch 00000: val_loss improved from inf to 0.68538, saving model to weights.best.rnn_reg.sberbank.hdf5
71s - loss: 0.8263 - mean_absolute_error: 0.5023 - val_loss: 0.6854 - val_mean_absolute_error: 0.4798
Epoch 2/7
Epoch 00001: val_loss improved from 0.68538 to 0.66981, saving model to weights.best.rnn_reg.sberbank.hdf5
58s - loss: 0.6962 - mean_absolute_error: 0.4657 - val_loss: 0.6698 - val_mean_absolute_error: 0.4564
Epoch 3/7
Epoch 00002: val_loss improved from 0.66981 to 0.64800, saving model to weights.best.rnn_reg.sberbank.hdf5
57s - loss: 0.6554 - mean_absolute_error: 0.4516 - val_loss: 0.6480 - val_mean_absolute_error: 0.4555
Epoch 4/7
Epoch 00003: val_loss improved from 0.64800 to 0.64164, saving model to weights.best.rnn_reg.sberbank.hdf5
56s - loss: 0.6355 - mean_absolute_error: 0.4461 - val_loss: 0.6416 - val_mean_absolute_error: 0.4302
Epoch 5/7
Epoch 00004: val_loss did not improve
54s - loss: 0.6169 - mean_absolute_error: 0.4398 - val_loss: 0.6672 - val_mean_absolute_error: 0.4464
Epoch 6/7
Epoch 00005: val_loss did not improve
53s - loss: 0.5886 - mean_absolute_error: 0.4359 - val_loss: 0.6483 - val_mean_absolute_error: 0.4270
Epoch 7/7
Epoch 00006: val_loss improved from 0.64164 to 0.64080, saving model to weights.best.rnn_reg.sberbank.hdf5
55s - loss: 0.5888 - mean_absolute_error: 0.4310 - val_loss: 0.6408 - val_mean_absolute_error: 0.4428
In [187]:
loss_plot(rnn_history)
mae_plot(rnn_history)
In [190]:
rnn_model.load_weights('weights.best.rnn_reg.sberbank.hdf5')
y_train_rnn = rnn_model.predict(X_train.reshape(16719, 1, 36))
y_test_rnn = rnn_model.predict(X_test.reshape(4180, 1, 36))

scores('RNN Model; Numeric Features', y_train, y_test, y_train_rnn, y_test_rnn)
_______________________________________
RNN Model; Numeric Features
_______________________________________
EV score. Train:  0.694726872478
EV score. Test:  0.682042054268
---------
R2 score. Train:  0.692969451929
R2 score. Test:  0.679745698991
---------
MSE score. Train:  0.55835947903
MSE score. Test:  0.640798075267
---------
MAE score. Train:  0.430895278251
MAE score. Test:  0.44278362717
---------
MdAE score. Train:  0.231135223971
MdAE score. Test:  0.235191990336
In [191]:
rnn_model.save('kaggle_sberbank_rnn_reg_model.h5')
In [192]:
def rnn_cat_model():
    model = Sequential()
    
    model.add(LSTM(156, return_sequences=True, input_shape=(1, 44)))
    model.add(LSTM(624, return_sequences=False))   
    
    model.add(Dense(1))

    model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])     
    return model 
In [193]:
rnn_cat_model = rnn_cat_model()
rnn_cat_checkpointer = ModelCheckpoint(filepath='weights.best.rnn_cat_reg.sberbank.hdf5', 
                                        verbose=2, save_best_only=True)

rnn_cat_history = rnn_cat_model.fit(X_train_cat.reshape(16719, 1, 44), y_train_cat.reshape(16719), 
                                    epochs=10, verbose=2, callbacks=[rnn_cat_checkpointer],
                                    validation_data=(X_test_cat.reshape(4180, 1, 44), y_test_cat.reshape(4180)))
Train on 16719 samples, validate on 4180 samples
Epoch 1/10
Epoch 00000: val_loss improved from inf to 0.67045, saving model to weights.best.rnn_cat_reg.sberbank.hdf5
76s - loss: 0.8135 - mean_absolute_error: 0.4995 - val_loss: 0.6705 - val_mean_absolute_error: 0.4532
Epoch 2/10
Epoch 00001: val_loss improved from 0.67045 to 0.65243, saving model to weights.best.rnn_cat_reg.sberbank.hdf5
63s - loss: 0.6709 - mean_absolute_error: 0.4619 - val_loss: 0.6524 - val_mean_absolute_error: 0.4589
Epoch 3/10
Epoch 00002: val_loss improved from 0.65243 to 0.65216, saving model to weights.best.rnn_cat_reg.sberbank.hdf5
63s - loss: 0.6418 - mean_absolute_error: 0.4499 - val_loss: 0.6522 - val_mean_absolute_error: 0.4489
Epoch 4/10
Epoch 00003: val_loss did not improve
62s - loss: 0.6305 - mean_absolute_error: 0.4425 - val_loss: 0.6540 - val_mean_absolute_error: 0.4620
Epoch 5/10
Epoch 00004: val_loss improved from 0.65216 to 0.62885, saving model to weights.best.rnn_cat_reg.sberbank.hdf5
69s - loss: 0.6127 - mean_absolute_error: 0.4374 - val_loss: 0.6289 - val_mean_absolute_error: 0.4366
Epoch 6/10
Epoch 00005: val_loss did not improve
64s - loss: 0.5939 - mean_absolute_error: 0.4337 - val_loss: 0.7236 - val_mean_absolute_error: 0.4687
Epoch 7/10
Epoch 00006: val_loss improved from 0.62885 to 0.62556, saving model to weights.best.rnn_cat_reg.sberbank.hdf5
64s - loss: 0.5909 - mean_absolute_error: 0.4282 - val_loss: 0.6256 - val_mean_absolute_error: 0.4296
Epoch 8/10
Epoch 00007: val_loss did not improve
62s - loss: 0.5756 - mean_absolute_error: 0.4222 - val_loss: 0.7031 - val_mean_absolute_error: 0.4706
Epoch 9/10
Epoch 00008: val_loss did not improve
64s - loss: 0.5622 - mean_absolute_error: 0.4229 - val_loss: 0.6336 - val_mean_absolute_error: 0.4270
Epoch 10/10
Epoch 00009: val_loss did not improve
63s - loss: 0.5488 - mean_absolute_error: 0.4188 - val_loss: 0.7496 - val_mean_absolute_error: 0.4679
In [194]:
loss_plot(rnn_cat_history)
mae_plot(rnn_cat_history)
In [195]:
rnn_cat_model.load_weights('weights.best.rnn_cat_reg.sberbank.hdf5')
y_train_cat_rnn = rnn_cat_model.predict(X_train_cat.reshape(16719, 1, 44))
y_test_cat_rnn = rnn_cat_model.predict(X_test_cat.reshape(4180, 1, 44))

scores('RNN Model; Numeric and Categorical Features', 
       y_train_cat, y_test_cat, y_train_cat_rnn, y_test_cat_rnn)
_______________________________________
RNN Model; Numeric and Categorical Features
_______________________________________
EV score. Train:  0.700834010878
EV score. Test:  0.687540257162
---------
R2 score. Train:  0.700355232858
R2 score. Test:  0.687363071722
---------
MSE score. Train:  0.544927848797
MSE score. Test:  0.62555644457
---------
MAE score. Train:  0.414134848192
MAE score. Test:  0.429569828232
---------
MdAE score. Train:  0.212920162943
MdAE score. Test:  0.217141705534
In [196]:
rnn_cat_model.save('kaggle_sberbank_rnn_cat_reg_model.h5')
In [198]:
def rnn_cat_enc_model():
    model = Sequential()
    
    model.add(LSTM(159, return_sequences=True, input_shape=(1, 636)))
    model.add(LSTM(636, return_sequences=False))   
    
    model.add(Dense(1))

    model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])     
    return model 
In [199]:
rnn_cat_enc_model = rnn_cat_enc_model()
rnn_cat_enc_checkpointer = ModelCheckpoint(filepath='weights.best.rnn_cat_enc_reg.sberbank.hdf5', 
                                        verbose=2, save_best_only=True)
rnn_cat_enc_history = \
rnn_cat_enc_model.fit(X_train_cat_enc.reshape(16719, 1, 636), y_train_cat_enc.reshape(16719), 
                      epochs=10, verbose=2, callbacks=[rnn_cat_enc_checkpointer],
                      validation_data=(X_test_cat_enc.reshape(4180, 1, 636), y_test_cat_enc.reshape(4180)))
Train on 16719 samples, validate on 4180 samples
Epoch 1/10
Epoch 00000: val_loss improved from inf to 0.74461, saving model to weights.best.rnn_cat_enc_reg.sberbank.hdf5
91s - loss: 0.7763 - mean_absolute_error: 0.4821 - val_loss: 0.7446 - val_mean_absolute_error: 0.4812
Epoch 2/10
Epoch 00001: val_loss improved from 0.74461 to 0.60433, saving model to weights.best.rnn_cat_enc_reg.sberbank.hdf5
80s - loss: 0.6413 - mean_absolute_error: 0.4411 - val_loss: 0.6043 - val_mean_absolute_error: 0.4336
Epoch 3/10
Epoch 00002: val_loss did not improve
71s - loss: 0.5985 - mean_absolute_error: 0.4309 - val_loss: 0.6550 - val_mean_absolute_error: 0.4327
Epoch 4/10
Epoch 00003: val_loss did not improve
71s - loss: 0.5730 - mean_absolute_error: 0.4260 - val_loss: 0.6088 - val_mean_absolute_error: 0.4234
Epoch 5/10
Epoch 00004: val_loss improved from 0.60433 to 0.59502, saving model to weights.best.rnn_cat_enc_reg.sberbank.hdf5
72s - loss: 0.5665 - mean_absolute_error: 0.4182 - val_loss: 0.5950 - val_mean_absolute_error: 0.4304
Epoch 6/10
Epoch 00005: val_loss did not improve
70s - loss: 0.5562 - mean_absolute_error: 0.4153 - val_loss: 0.6166 - val_mean_absolute_error: 0.4242
Epoch 7/10
Epoch 00006: val_loss did not improve
77s - loss: 0.5428 - mean_absolute_error: 0.4136 - val_loss: 0.7043 - val_mean_absolute_error: 0.4502
Epoch 8/10
Epoch 00007: val_loss did not improve
72s - loss: 0.5335 - mean_absolute_error: 0.4107 - val_loss: 0.6105 - val_mean_absolute_error: 0.4261
Epoch 9/10
Epoch 00008: val_loss did not improve
78s - loss: 0.5270 - mean_absolute_error: 0.4066 - val_loss: 0.6433 - val_mean_absolute_error: 0.4426
Epoch 10/10
Epoch 00009: val_loss did not improve
73s - loss: 0.5227 - mean_absolute_error: 0.4058 - val_loss: 0.6455 - val_mean_absolute_error: 0.4258
In [200]:
loss_plot(rnn_cat_enc_history)
mae_plot(rnn_cat_enc_history)
In [201]:
rnn_cat_enc_model.load_weights('weights.best.rnn_cat_enc_reg.sberbank.hdf5')
y_train_cat_enc_rnn = rnn_cat_enc_model.predict(X_train_cat_enc.reshape(16719, 1, 636))
y_test_cat_enc_rnn = rnn_cat_enc_model.predict(X_test_cat_enc.reshape(4180, 1, 636))

scores('RNN Model; Numeric and Encoded Categorical Features', 
       y_train_cat_enc, y_test_cat_enc, y_train_cat_enc_rnn, y_test_cat_enc_rnn)
_______________________________________
RNN Model; Numeric and Encoded Categorical Features
_______________________________________
EV score. Train:  0.71701896004
EV score. Test:  0.703846931267
---------
R2 score. Train:  0.716387701913
R2 score. Test:  0.702624960581
---------
MSE score. Train:  0.515771528277
MSE score. Test:  0.595018871852
---------
MAE score. Train:  0.408076438578
MAE score. Test:  0.430377326061
---------
MdAE score. Train:  0.206323881414
MdAE score. Test:  0.222060045026
In [202]:
rnn_cat_enc_model.save('kaggle_sberbank_rnn_cat_enc_reg_model.h5')

Display Predictions

In [203]:
plt.figure(figsize = (18, 6))

plt.plot(y_test[1:50], color = 'black', label='Real Data')

plt.plot(y_test_mlp[1:50], label='MLP')
plt.plot(y_test_cnn[1:50], label='CNN')
plt.plot(y_test_rnn[1:50], label='RNN')

plt.legend()
plt.title("Numeric Features; Neural Network Predictions vs Real Data");
In [204]:
plt.figure(figsize = (18, 6))

plt.plot(y_test_cat[1:50], color = 'black', label='Real Data')

plt.plot(y_test_cat_mlp[1:50], label='MLP')
plt.plot(y_test_cat_cnn[1:50], label='CNN')
plt.plot(y_test_cat_rnn[1:50], label='RNN')

plt.legend()
plt.title("Numeric and Categorical Features; Neural Network Predictions vs Real Data");
In [205]:
plt.figure(figsize = (18, 6))

plt.plot(y_test_cat[1:50], color = 'black', label='Real Data')

plt.plot(y_test_cat_enc_mlp[1:50], label='MLP')
plt.plot(y_test_cat_enc_cnn[1:50], label='CNN')
plt.plot(y_test_cat_enc_rnn[1:50], label='RNN')

plt.legend()
plt.title("Numeric and Encoded Categorical Features; Neural Network Predictions vs Real Data");

5. Evaluation Metrics

  • explained variance regression score
  • coefficient of determination
  • mean squared error
  • mean absolute error
  • median absolute error
In [206]:
# Scale
target_scale = RobustScaler()
s_target_train = target_scale.fit_transform(target_train.reshape(-1,1))
#########################################################################################
feature_scale = RobustScaler()
s_features_train = feature_scale.fit_transform(features_train)
s_features_test = feature_scale.transform(features_test)
########################################################################################
feature_cat_scale = RobustScaler()
s_features_train_cat = feature_cat_scale.fit_transform(features_train_cat)
s_features_test_cat = feature_cat_scale.transform(features_test_cat)
########################################################################################
feature_cat_enc_scale = RobustScaler()
s_features_train_cat_enc = feature_cat_enc_scale.fit_transform(features_train_cat_enc)
s_features_test_cat_enc = feature_cat_enc_scale.transform(features_test_cat_enc)

5.1 Regressors; Scikit-Learn

Numeric Features

In [207]:
gbr = GradientBoostingRegressor(max_depth=4, n_estimators=360)
gbr.fit(s_features_train, s_target_train)

s_target_train_gbr = gbr.predict(s_features_train)
s_target_test_gbr = gbr.predict(s_features_test)

scores2('Gradient Boosting Regressor', s_target_train, s_target_train_gbr)
_______________________________________
Gradient Boosting Regressor
_______________________________________
EV score: 0.851729559483
---------
R2 score: 0.851729559483
---------
MSE score: 0.273663122104
---------
MAE score: 0.324355312761
---------
MdAE score: 0.17539487972
In [208]:
br = BaggingRegressor(n_estimators=252)
br.fit(s_features_train, s_target_train)

s_target_train_br = br.predict(s_features_train)
s_target_test_br = br.predict(s_features_test)

scores2('Bagging Regressor', s_target_train, s_target_train_br)
_______________________________________
Bagging Regressor
_______________________________________
EV score: 0.958263131758
---------
R2 score: 0.958238545032
---------
MSE score: 0.0770792216591
---------
MAE score: 0.144707485868
---------
MdAE score: 0.0631377765764
In [209]:
s_target_train_mlpr = mlpr.predict(s_features_train)
s_target_test_mlpr = mlpr.predict(s_features_test)

scores2('MLP Regressor', s_target_train, s_target_train_mlpr)
_______________________________________
MLP Regressor
_______________________________________
EV score: 0.702986839727
---------
R2 score: 0.70282819201
---------
MSE score: 0.548490747665
---------
MAE score: 0.421559439291
---------
MdAE score: 0.230532515885

Numeric and Categorical Features

In [210]:
gbr_cat = GradientBoostingRegressor(max_depth=3, n_estimators=396)
gbr_cat.fit(s_features_train_cat, s_target_train)

s_target_train_cat_gbr = gbr_cat.predict(s_features_train_cat)
s_target_test_cat_gbr = gbr_cat.predict(s_features_test_cat)

scores2('Gradient Boosting Regressor', s_target_train, s_target_train_cat_gbr)
_______________________________________
Gradient Boosting Regressor
_______________________________________
EV score: 0.813227343634
---------
R2 score: 0.813227343634
---------
MSE score: 0.344726757987
---------
MAE score: 0.357167721423
---------
MdAE score: 0.190744564286
In [211]:
br_cat = BaggingRegressor(n_estimators=220)
br_cat.fit(s_features_train_cat, s_target_train)

s_target_train_cat_br = br_cat.predict(s_features_train_cat)
s_target_test_cat_br = br_cat.predict(s_features_test_cat)

scores2('Bagging Regressor', s_target_train, s_target_train_cat_br)
_______________________________________
Bagging Regressor
_______________________________________
EV score: 0.958772559127
---------
R2 score: 0.958749967861
---------
MSE score: 0.0761352872667
---------
MAE score: 0.144716277889
---------
MdAE score: 0.0628972347414
In [212]:
s_target_train_cat_mlpr = mlpr_cat.predict(s_features_train_cat)
s_target_test_cat_mlpr = mlpr_cat.predict(s_features_test_cat)

scores2('MLP Regressor', s_target_train, s_target_train_cat_mlpr)
_______________________________________
MLP Regressor
_______________________________________
EV score: 0.713256535113
---------
R2 score: 0.713239004476
---------
MSE score: 0.529275485113
---------
MAE score: 0.417176070834
---------
MdAE score: 0.22864448087

Numeric and Encoded Categorical Features

In [213]:
gbr_cat_enc = GradientBoostingRegressor(max_depth=3, n_estimators=159)
gbr_cat_enc.fit(s_features_train_cat_enc, s_target_train)

s_target_train_cat_enc_gbr = gbr_cat_enc.predict(s_features_train_cat_enc)
s_target_test_cat_enc_gbr = gbr_cat_enc.predict(s_features_test_cat_enc)

scores2('Gradient Boosting Regressor', s_target_train, s_target_train_cat_enc_gbr)
_______________________________________
Gradient Boosting Regressor
_______________________________________
EV score: 0.764000775181
---------
R2 score: 0.764000775181
---------
MSE score: 0.435584358236
---------
MAE score: 0.394758875506
---------
MdAE score: 0.206033263364
In [214]:
br_cat_enc = BaggingRegressor(n_estimators=159)
br_cat_enc.fit(s_features_train_cat_enc, s_target_train)

s_target_train_cat_enc_br = br_cat.predict(s_features_train_cat_enc)
s_target_test_cat_enc_br = br_cat.predict(s_features_test_cat_enc)

scores2('Bagging Regressor', s_target_train, s_target_train_cat_enc_br)
_______________________________________
Bagging Regressor
_______________________________________
EV score: 0.922887812416
---------
R2 score: 0.922796176133
---------
MSE score: 0.142495290389
---------
MAE score: 0.198021437384
---------
MdAE score: 0.0907021129512
In [215]:
s_target_train_cat_enc_mlpr = mlpr_cat_enc.predict(s_features_train_cat_enc)
s_target_test_cat_enc_mlpr = mlpr_cat_enc.predict(s_features_test_cat_enc)

scores2('MLP Regressor', s_target_train, s_target_train_cat_enc_mlpr)
_______________________________________
MLP Regressor
_______________________________________
EV score: 0.743849202665
---------
R2 score: 0.743848544029
---------
MSE score: 0.472779381568
---------
MAE score: 0.404735425541
---------
MdAE score: 0.221238105006

5.2 Neural Networks; Keras

Numeric Features

In [216]:
s_target_train_mlp = mlp_model.predict(s_features_train)
s_target_test_mlp = mlp_model.predict(s_features_test)

scores2('MLP', s_target_train, s_target_train_mlp)
_______________________________________
MLP
_______________________________________
EV score: 0.731482725603
---------
R2 score: 0.731315600285
---------
MSE score: 0.495911467115
---------
MAE score: 0.408692774398
---------
MdAE score: 0.211420994803
In [217]:
s_target_train_cnn = cnn_model.predict(s_features_train.reshape(20899, 36, 1))
s_target_test_cnn = cnn_model.predict(s_features_test.reshape(7662, 36, 1))

scores2('CNN', s_target_train, s_target_train_cnn)
_______________________________________
CNN
_______________________________________
EV score: 0.692675452546
---------
R2 score: 0.69252083441
---------
MSE score: 0.56751506331
---------
MAE score: 0.436886759558
---------
MdAE score: 0.231537122834
In [218]:
s_target_train_rnn = rnn_model.predict(s_features_train.reshape(20899, 1, 36))
s_target_test_rnn = rnn_model.predict(s_features_test.reshape(7662, 1, 36))

scores2('RNN', s_target_train, s_target_train_rnn)
_______________________________________
RNN
_______________________________________
EV score: 0.691963197241
---------
R2 score: 0.689972882334
---------
MSE score: 0.572217824817
---------
MAE score: 0.432831273426
---------
MdAE score: 0.232529085044

Numeric and Categorical Feature

In [219]:
s_target_train_cat_mlp = mlp_cat_model.predict(s_features_train_cat)
s_target_test_cat_mlp = mlp_cat_model.predict(s_features_test_cat)

scores2('MLP', s_target_train, s_target_train_cat_mlp)
_______________________________________
MLP
_______________________________________
EV score: 0.746956332742
---------
R2 score: 0.746351577076
---------
MSE score: 0.468159527226
---------
MAE score: 0.405758841363
---------
MdAE score: 0.215336569143
In [220]:
s_target_train_cat_cnn = cnn_cat_model.predict(s_features_train_cat.reshape(20899, 44, 1))
s_target_test_cat_cnn = cnn_cat_model.predict(s_features_test_cat.reshape(7662, 44, 1))

scores2('CNN', s_target_train, s_target_train_cat_cnn)
_______________________________________
CNN
_______________________________________
EV score: 0.720333313386
---------
R2 score: 0.720327841696
---------
MSE score: 0.516191600565
---------
MAE score: 0.424670528753
---------
MdAE score: 0.227845395163
In [221]:
s_target_train_cat_rnn = rnn_cat_model.predict(s_features_train_cat.reshape(20899, 1, 44))
s_target_test_cat_rnn = rnn_cat_model.predict(s_features_test_cat.reshape(7662, 1, 44))

scores2('RNN', s_target_train, s_target_train_cat_rnn)
_______________________________________
RNN
_______________________________________
EV score: 0.697941603968
---------
R2 score: 0.697569961754
---------
MSE score: 0.55819587637
---------
MAE score: 0.416450430077
---------
MdAE score: 0.213839573101

Numeric and Encoded Categorical Features

In [222]:
s_target_train_cat_enc_mlp = mlp_cat_enc_model.predict(s_features_train_cat_enc)
s_target_test_cat_enc_mlp = mlp_cat_enc_model.predict(s_features_test_cat_enc)

scores2('MLP', s_target_train, s_target_train_cat_enc_mlp)
_______________________________________
MLP
_______________________________________
EV score: 0.737975651529
---------
R2 score: 0.736147962043
---------
MSE score: 0.486992364958
---------
MAE score: 0.414191738172
---------
MdAE score: 0.221306182345
In [223]:
s_target_train_cat_enc_cnn = cnn_cat_enc_model.predict(s_features_train_cat_enc.reshape(20899, 636, 1))
s_target_test_cat_enc_cnn = cnn_cat_enc_model.predict(s_features_test_cat_enc.reshape(7662, 636, 1))

scores2('CNN', s_target_train, s_target_train_cat_enc_cnn)
_______________________________________
CNN
_______________________________________
EV score: 0.714118491154
---------
R2 score: 0.713903410734
---------
MSE score: 0.528049188824
---------
MAE score: 0.428397088825
---------
MdAE score: 0.231605962583
In [224]:
s_target_train_cat_enc_rnn = rnn_cat_enc_model.predict(s_features_train_cat_enc.reshape(20899, 1, 636))
s_target_test_cat_enc_rnn = rnn_cat_enc_model.predict(s_features_test_cat_enc.reshape(7662, 1, 636))

scores2('RNN', s_target_train, s_target_train_cat_enc_rnn)
_______________________________________
RNN
_______________________________________
EV score: 0.714284292098
---------
R2 score: 0.713449953579
---------
MSE score: 0.528886135827
---------
MAE score: 0.412166119379
---------
MdAE score: 0.210303081669

6. Predictions

In [225]:
# Rescale Predictions
target_train_gbr = target_scale.inverse_transform(s_target_train_gbr.reshape(-1,1))
target_test_gbr = target_scale.inverse_transform(s_target_test_gbr.reshape(-1,1))
target_train_br = target_scale.inverse_transform(s_target_train_br.reshape(-1,1))
target_test_br = target_scale.inverse_transform(s_target_test_br.reshape(-1,1))
target_train_mlpr = target_scale.inverse_transform(s_target_train_mlpr.reshape(-1,1))
target_test_mlpr = target_scale.inverse_transform(s_target_test_mlpr.reshape(-1,1))

target_train_mlp = target_scale.inverse_transform(s_target_train_mlp)
target_test_mlp = target_scale.inverse_transform(s_target_test_mlp)
target_train_cnn = target_scale.inverse_transform(s_target_train_cnn)
target_test_cnn = target_scale.inverse_transform(s_target_test_cnn)
target_train_rnn = target_scale.inverse_transform(s_target_train_rnn)
target_test_rnn = target_scale.inverse_transform(s_target_test_rnn)
In [226]:
plt.figure(figsize = (18, 6))

plt.plot(target_train[1:50], color = 'black', label='Real Data')

plt.plot(target_train_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_train_br[1:50], label='Bagging Regressor')
plt.plot(target_train_mlpr[1:50], label='MLP Regressor')

plt.plot(target_train_mlp[1:50], label='MLP')
plt.plot(target_train_cnn[1:50], label='CNN')
plt.plot(target_train_rnn[1:50], label='RNN')

plt.legend()
plt.title("Numeric Features; Train Predictions vs Real Data");
In [227]:
plt.figure(figsize = (18, 6))

plt.plot(target_test_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_test_br[1:50], label='Bagging Regressor')
plt.plot(target_test_mlpr[1:50], label='MLP Regressor')

plt.plot(target_test_mlp[1:50], label='MLP')
plt.plot(target_test_cnn[1:50], label='CNN')
plt.plot(target_test_rnn[1:50], label='RNN')

plt.legend()
plt.title("Numeric Features; Test Predictions");
In [228]:
# Rescale Predictions
target_train_cat_gbr = target_scale.inverse_transform(s_target_train_cat_gbr.reshape(-1,1))
target_test_cat_gbr = target_scale.inverse_transform(s_target_test_cat_gbr.reshape(-1,1))
target_train_cat_br = target_scale.inverse_transform(s_target_train_cat_br.reshape(-1,1))
target_test_cat_br = target_scale.inverse_transform(s_target_test_cat_br.reshape(-1,1))
target_train_cat_mlpr = target_scale.inverse_transform(s_target_train_cat_mlpr.reshape(-1,1))
target_test_cat_mlpr = target_scale.inverse_transform(s_target_test_cat_mlpr.reshape(-1,1))

target_train_cat_mlp = target_scale.inverse_transform(s_target_train_cat_mlp.reshape(-1,1))
target_test_cat_mlp = target_scale.inverse_transform(s_target_test_cat_mlp.reshape(-1,1))
target_train_cat_cnn = target_scale.inverse_transform(s_target_train_cat_cnn.reshape(-1,1))
target_test_cat_cnn = target_scale.inverse_transform(s_target_test_cat_cnn.reshape(-1,1))
target_train_cat_rnn = target_scale.inverse_transform(s_target_train_cat_rnn.reshape(-1,1))
target_test_cat_rnn = target_scale.inverse_transform(s_target_test_cat_rnn.reshape(-1,1))
In [229]:
plt.figure(figsize = (18, 6))

plt.plot(target_train[1:50], color = 'black', label='Real Data')

plt.plot(target_train_cat_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_train_cat_br[1:50], label='Bagging Regressor')
plt.plot(target_train_cat_mlpr[1:50], label='MLP Regressor')

plt.plot(target_train_cat_mlp[1:50], label='MLP')
plt.plot(target_train_cat_cnn[1:50], label='CNN')
plt.plot(target_train_cat_rnn[1:50], label='RNN')

plt.legend()
plt.title("Numeric and Categorical Features; Train Predictions vs Real Data");
In [230]:
plt.figure(figsize = (18, 6))

plt.plot(target_test_cat_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_test_cat_br[1:50], label='Bagging Regressor')
plt.plot(target_test_cat_mlpr[1:50], label='MLP Regressor')

plt.plot(target_test_cat_mlp[1:50], label='MLP')
plt.plot(target_test_cat_cnn[1:50], label='CNN')
plt.plot(target_test_cat_rnn[1:50], label='RNN')

plt.legend()
plt.title("Numeric and Categorical Features; Test Predictions");
In [231]:
# Rescale Predictions
target_train_cat_enc_gbr = target_scale.inverse_transform(s_target_train_cat_enc_gbr.reshape(-1,1))
target_test_cat_enc_gbr = target_scale.inverse_transform(s_target_test_cat_enc_gbr.reshape(-1,1))
target_train_cat_enc_br = target_scale.inverse_transform(s_target_train_cat_enc_br.reshape(-1,1))
target_test_cat_enc_br = target_scale.inverse_transform(s_target_test_cat_enc_br.reshape(-1,1))
target_train_cat_enc_mlpr = target_scale.inverse_transform(s_target_train_cat_enc_mlpr.reshape(-1,1))
target_test_cat_enc_mlpr = target_scale.inverse_transform(s_target_test_cat_enc_mlpr.reshape(-1,1))

target_train_cat_enc_mlp = target_scale.inverse_transform(s_target_train_cat_enc_mlp.reshape(-1,1))
target_test_cat_enc_mlp = target_scale.inverse_transform(s_target_test_cat_enc_mlp.reshape(-1,1))
target_train_cat_enc_cnn = target_scale.inverse_transform(s_target_train_cat_enc_cnn.reshape(-1,1))
target_test_cat_enc_cnn = target_scale.inverse_transform(s_target_test_cat_enc_cnn.reshape(-1,1))
target_train_cat_enc_rnn = target_scale.inverse_transform(s_target_train_cat_enc_rnn.reshape(-1,1))
target_test_cat_enc_rnn = target_scale.inverse_transform(s_target_test_cat_enc_rnn.reshape(-1,1))
In [232]:
plt.figure(figsize = (18, 6))

plt.plot(target_train[1:50], color = 'black', label='Real Data')

plt.plot(target_train_cat_enc_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_train_cat_enc_br[1:50], label='Bagging Regressor')
plt.plot(target_train_cat_enc_mlpr[1:50], label='MLP Regressor')

plt.plot(target_train_cat_enc_mlp[1:50], label='MLP')
plt.plot(target_train_cat_enc_cnn[1:50], label='CNN')
plt.plot(target_train_cat_enc_rnn[1:50], label='RNN')

plt.legend()
plt.title("Numeric and Encoded Categorical Features; Train Predictions vs Real Data");
In [233]:
plt.figure(figsize = (18, 6))

plt.plot(target_test_cat_enc_gbr[1:50], label='Gradient Boosting Regressor')
plt.plot(target_test_cat_enc_br[1:50], label='Bagging Regressor')
plt.plot(target_test_cat_enc_mlpr[1:50], label='MLP Regressor')

plt.plot(target_test_cat_enc_mlp[1:50], label='MLP')
plt.plot(target_test_cat_enc_cnn[1:50], label='CNN')
plt.plot(target_test_cat_enc_rnn[1:50], label='RNN')

plt.legend()
plt.title("Numeric and Encoded Categorical Features; Test Predictions");

7. Submissions

7.1 Regressors; Scikit-Learn

In [235]:
target_gbr = ["{0:.2f}".format(x) for x in target_test_gbr.reshape(-1)]

submission_gbr = pd.DataFrame({"id": test['id'], "price_doc": target_gbr})
print(submission_gbr[0:20])

submission_gbr.to_csv('kaggle_sberbank_gbr.csv', index=False)
       id    price_doc
0   30474   6073792.71
1   30475   8150382.72
2   30476   5818806.06
3   30477   6247885.59
4   30478   5208291.67
5   30479   8991884.77
6   30480   4466193.68
7   30481   4363901.24
8   30482   4435678.92
9   30483   5231236.13
10  30484   6822426.94
11  30485   5208291.67
12  30486   4248343.90
13  30487   4803127.22
14  30488   6059751.66
15  30489   5676785.00
16  30490  21264016.44
17  30491  17213802.65
18  30492   6418769.91
19  30493  17093114.94
In [248]:
target_br = ["{0:.2f}".format(x) for x in target_test_br.reshape(-1)]

submission_br = pd.DataFrame({"id": test['id'], "price_doc": target_br})
print(submission_br[0:20])

submission_br.to_csv('kaggle_sberbank_br.csv', index=False)
       id    price_doc
0   30474   5546637.62
1   30475   8389156.56
2   30476   6497919.23
3   30477   6505779.76
4   30478   5077319.50
5   30479  10094749.65
6   30480   4558864.65
7   30481   4270513.69
8   30482   4626310.55
9   30483   5631038.56
10  30484   6403808.72
11  30485   4947976.00
12  30486   4168202.40
13  30487   4793751.25
14  30488   7940547.17
15  30489   5491109.61
16  30490  21854791.21
17  30491  16957167.63
18  30492   6437148.41
19  30493  14943140.41
In [249]:
target_mlpr = ["{0:.2f}".format(x) for x in target_test_mlpr.reshape(-1)]

submission_mlpr = pd.DataFrame({"id": test['id'], "price_doc": target_mlpr})
print(submission_mlpr[0:20])

submission_mlpr.to_csv('kaggle_sberbank_mlpr.csv', index=False)
       id    price_doc
0   30474   4705974.98
1   30475   9184167.44
2   30476   5184312.84
3   30477   6475784.27
4   30478   5268916.12
5   30479   9286856.66
6   30480   4450959.20
7   30481   4136713.95
8   30482   6293129.28
9   30483   5189490.14
10  30484   6521724.50
11  30485   5217588.22
12  30486   3821173.99
13  30487   6022160.59
14  30488   6666426.42
15  30489   5892976.17
16  30490  22847779.03
17  30491  20607893.20
18  30492   7502682.21
19  30493  17886532.01
In [250]:
target_gbr_cat = ["{0:.2f}".format(x) for x in target_test_cat_gbr.reshape(-1)]

submission_gbr_cat = pd.DataFrame({"id": test['id'], "price_doc": target_gbr_cat})
print(submission_gbr_cat[0:20])

submission_gbr_cat.to_csv('kaggle_sberbank_gbr_cat.csv', index=False)
       id    price_doc
0   30474   5912716.65
1   30475   7974392.88
2   30476   5858493.35
3   30477   6202651.00
4   30478   5364302.58
5   30479   8560863.52
6   30480   4473971.05
7   30481   4407103.48
8   30482   4289849.90
9   30483   5348391.08
10  30484   7028558.14
11  30485   5364302.58
12  30486   4500022.76
13  30487   4887658.76
14  30488   6107215.42
15  30489   5952158.54
16  30490  21162121.03
17  30491  17229893.01
18  30492   6451063.59
19  30493  16228805.00
In [251]:
target_br_cat = ["{0:.2f}".format(x) for x in target_test_cat_br.reshape(-1)]

submission_br_cat = pd.DataFrame({"id": test['id'], "price_doc": target_br_cat})
print(submission_br_cat[0:20])

submission_br_cat.to_csv('kaggle_sberbank_br_cat.csv', index=False)
       id    price_doc
0   30474   5594678.58
1   30475   8413620.53
2   30476   6383869.27
3   30477   6509850.71
4   30478   5104417.55
5   30479  10078368.67
6   30480   4556594.84
7   30481   4276238.23
8   30482   4387281.98
9   30483   5608831.69
10  30484   6381087.54
11  30485   4948949.09
12  30486   4260138.13
13  30487   4697835.21
14  30488   7995266.44
15  30489   5498673.95
16  30490  21964976.19
17  30491  16732463.62
18  30492   6421748.85
19  30493  15471603.40
In [252]:
target_mlpr_cat = ["{0:.2f}".format(x) for x in target_test_cat_mlpr.reshape(-1)]

submission_mlpr_cat = pd.DataFrame({"id": test['id'], "price_doc": target_mlpr_cat})
print(submission_mlpr_cat[0:20])

submission_mlpr_cat.to_csv('kaggle_sberbank_mlpr_cat.csv', index=False)
       id    price_doc
0   30474   4316285.58
1   30475   8518020.89
2   30476   6269820.30
3   30477   5968243.94
4   30478   4885278.68
5   30479   9520833.66
6   30480   4571099.22
7   30481   4337914.05
8   30482   5102844.42
9   30483   5653404.46
10  30484   7305964.04
11  30485   4837580.46
12  30486   5326538.48
13  30487   5011265.80
14  30488   7237371.12
15  30489   6630155.18
16  30490  19326054.49
17  30491  21003431.25
18  30492   6025808.61
19  30493  18032322.45
In [253]:
target_gbr_cat_enc = ["{0:.2f}".format(x) for x in target_test_cat_enc_gbr.reshape(-1)]

submission_gbr_cat_enc = pd.DataFrame({"id": test['id'], "price_doc": target_gbr_cat_enc})
print(submission_gbr_cat_enc[0:20])

submission_gbr_cat_enc.to_csv('kaggle_sberbank_gbr_cat_enc.csv', index=False)
       id    price_doc
0   30474   6021546.75
1   30475   7846955.09
2   30476   5842330.01
3   30477   5744159.23
4   30478   5585856.24
5   30479   7809837.60
6   30480   6629326.76
7   30481   4292011.59
8   30482   5879696.92
9   30483   4854506.26
10  30484   6917497.88
11  30485   5585856.24
12  30486   4560513.45
13  30487   5109981.63
14  30488   6180068.25
15  30489   5878001.35
16  30490  20322785.71
17  30491  17050385.04
18  30492   6625461.06
19  30493  15427405.28
In [254]:
target_br_cat_enc = ["{0:.2f}".format(x) for x in target_test_cat_enc_br.reshape(-1)]

submission_br_cat_enc = pd.DataFrame({"id": test['id'], "price_doc": target_br_cat_enc})
print(submission_br_cat_enc[0:20])

submission_br_cat_enc.to_csv('kaggle_sberbank_br_cat_enc.csv', index=False)
       id    price_doc
0   30474   5343920.29
1   30475   8502249.23
2   30476   6108376.32
3   30477   6476850.14
4   30478   5088214.48
5   30479   9379233.07
6   30480   4577249.91
7   30481   4274650.06
8   30482   4627767.35
9   30483   5499577.82
10  30484   6373422.31
11  30485   4953071.04
12  30486   4227618.70
13  30487   4729199.63
14  30488   7957606.93
15  30489   5506495.08
16  30490  21311725.65
17  30491  16633102.88
18  30492   6470016.16
19  30493  15252160.56
In [255]:
target_mlpr_cat_enc = ["{0:.2f}".format(x) for x in target_test_cat_enc_mlpr.reshape(-1)]

submission_mlpr_cat_enc = pd.DataFrame({"id": test['id'], "price_doc": target_mlpr_cat_enc})
print(submission_mlpr_cat_enc[0:20])

submission_mlpr_cat_enc.to_csv('kaggle_sberbank_mlpr_cat_enc.csv', index=False)
       id    price_doc
0   30474   4190365.98
1   30475   7896089.61
2   30476   4666784.71
3   30477   4815658.49
4   30478   4699214.39
5   30479   8488879.89
6   30480   7279627.18
7   30481   5182556.71
8   30482   7182237.62
9   30483   4522415.81
10  30484   7057664.42
11  30485   4632050.04
12  30486   4097216.33
13  30487   6144704.21
14  30488   6649176.93
15  30489   7088986.46
16  30490  26770786.53
17  30491  18216045.66
18  30492   6851392.61
19  30493  17377842.48

7.2 Neural Networks; Keras

In [236]:
target_mlp = ["{0:.2f}".format(x) for x in target_test_mlp.reshape(-1)]

submission_mlp = pd.DataFrame({"id": test['id'], "price_doc": target_mlp})
print(submission_mlp[0:20])

submission_mlp.to_csv('kaggle_sberbank_mlp.csv', index=False)
       id    price_doc
0   30474   6034118.50
1   30475   8600627.00
2   30476   5374775.50
3   30477   6132221.00
4   30478   5029430.00
5   30479   8166303.00
6   30480   4512722.50
7   30481   4243866.00
8   30482   4947822.00
9   30483   4780154.00
10  30484   6171841.00
11  30485   4982720.00
12  30486   4366005.50
13  30487   4845664.00
14  30488   5501534.50
15  30489   5671823.00
16  30490  28117044.00
17  30491  21374712.00
18  30492   6601480.00
19  30493  18378118.00
In [237]:
target_cnn = ["{0:.2f}".format(x) for x in target_test_cnn.reshape(-1)]

submission_cnn = pd.DataFrame({"id": test['id'], "price_doc": target_cnn})
print(submission_cnn[0:20])

submission_cnn.to_csv('kaggle_sberbank_cnn.csv', index=False)
       id    price_doc
0   30474   5724029.50
1   30475   8732130.00
2   30476   5307823.00
3   30477   6228442.50
4   30478   5446590.00
5   30479   7590949.50
6   30480   4038358.50
7   30481   4871084.50
8   30482   6012047.50
9   30483   5159022.00
10  30484   6022114.50
11  30485   5427487.50
12  30486   4679518.00
13  30487   5347206.50
14  30488   5461732.50
15  30489   5715214.00
16  30490  21371960.00
17  30491  22899156.00
18  30492   7000611.50
19  30493  17144310.00
In [238]:
target_rnn = ["{0:.2f}".format(x) for x in target_test_rnn.reshape(-1)]

submission_rnn = pd.DataFrame({"id": test['id'], "price_doc": target_cnn})
print(submission_rnn[0:20])

submission_rnn.to_csv('kaggle_sberbank_rnn.csv', index=False)
       id    price_doc
0   30474   5724029.50
1   30475   8732130.00
2   30476   5307823.00
3   30477   6228442.50
4   30478   5446590.00
5   30479   7590949.50
6   30480   4038358.50
7   30481   4871084.50
8   30482   6012047.50
9   30483   5159022.00
10  30484   6022114.50
11  30485   5427487.50
12  30486   4679518.00
13  30487   5347206.50
14  30488   5461732.50
15  30489   5715214.00
16  30490  21371960.00
17  30491  22899156.00
18  30492   7000611.50
19  30493  17144310.00
In [240]:
target_mlp_cat = ["{0:.2f}".format(x) for x in target_test_cat_mlp.reshape(-1)]

submission_mlp_cat = pd.DataFrame({"id": test['id'], "price_doc": target_mlp_cat})
print(submission_mlp_cat[0:20])

submission_mlp_cat.to_csv('kaggle_sberbank_mlp_cat.csv', index=False)
       id    price_doc
0   30474   5368882.50
1   30475   8719584.00
2   30476   5662475.50
3   30477   5832479.00
4   30478   5217242.00
5   30479   9250640.00
6   30480   4885286.00
7   30481   4397383.00
8   30482   5147554.00
9   30483   4808095.50
10  30484   6210111.50
11  30485   5182556.50
12  30486   4494641.50
13  30487   4965736.00
14  30488   5360918.00
15  30489   5370358.00
16  30490  21378492.00
17  30491  18097152.00
18  30492   6708410.00
19  30493  16989292.00
In [242]:
target_cnn_cat = ["{0:.2f}".format(x) for x in target_test_cat_cnn.reshape(-1)]

submission_cnn_cat = pd.DataFrame({"id": test['id'], "price_doc": target_cnn_cat})
print(submission_cnn_cat[0:20])

submission_cnn_cat.to_csv('kaggle_sberbank_cnn_cat.csv', index=False)
       id    price_doc
0   30474   5502717.00
1   30475   8401641.00
2   30476   5808715.00
3   30477   6648018.00
4   30478   5411799.00
5   30479   8352609.50
6   30480   4122066.75
7   30481   5215024.00
8   30482   5764626.50
9   30483   5283518.00
10  30484   6360740.50
11  30485   5393831.00
12  30486   4922319.50
13  30487   5379418.50
14  30488   5615201.50
15  30489   5982559.00
16  30490  21974796.00
17  30491  21080468.00
18  30492   6670079.00
19  30493  16868032.00
In [243]:
target_rnn_cat = ["{0:.2f}".format(x) for x in target_test_cat_rnn.reshape(-1)]

submission_rnn_cat = pd.DataFrame({"id": test['id'], "price_doc": target_cnn_cat})
print(submission_rnn_cat[0:20])

submission_rnn_cat.to_csv('kaggle_sberbank_rnn_cat.csv', index=False)
       id    price_doc
0   30474   5502717.00
1   30475   8401641.00
2   30476   5808715.00
3   30477   6648018.00
4   30478   5411799.00
5   30479   8352609.50
6   30480   4122066.75
7   30481   5215024.00
8   30482   5764626.50
9   30483   5283518.00
10  30484   6360740.50
11  30485   5393831.00
12  30486   4922319.50
13  30487   5379418.50
14  30488   5615201.50
15  30489   5982559.00
16  30490  21974796.00
17  30491  21080468.00
18  30492   6670079.00
19  30493  16868032.00
In [244]:
target_mlp_cat_enc = ["{0:.2f}".format(x) for x in target_test_cat_enc_mlp.reshape(-1)]

submission_mlp_cat_enc = pd.DataFrame({"id": test['id'], "price_doc": target_mlp_cat_enc})
print(submission_mlp_cat_enc[0:20])

submission_mlp_cat_enc.to_csv('kaggle_sberbank_mlp_cat_enc.csv', index=False)
       id    price_doc
0   30474   5282548.00
1   30475   7961081.00
2   30476   6469610.50
3   30477   6125334.00
4   30478   5686417.00
5   30479   6954323.50
6   30480   5428550.00
7   30481   5196612.50
8   30482   5581097.00
9   30483   5433756.00
10  30484   6078440.00
11  30485   5657090.00
12  30486   4949242.00
13  30487   5791152.00
14  30488   6016482.00
15  30489   6768275.00
16  30490  28411532.00
17  30491  14169335.00
18  30492   6964734.50
19  30493  14107480.00
In [246]:
target_cnn_cat_enc = ["{0:.2f}".format(x) for x in target_test_cat_enc_cnn.reshape(-1)]

submission_cnn_cat_enc = pd.DataFrame({"id": test['id'], "price_doc": target_cnn_cat_enc})
print(submission_cnn_cat_enc[0:20])

submission_cnn_cat_enc.to_csv('kaggle_sberbank_cnn_cat_enc.csv', index=False)
       id    price_doc
0   30474   5246308.50
1   30475   8571797.00
2   30476   6716764.00
3   30477   7142270.00
4   30478   6176451.50
5   30479   7352989.00
6   30480   6200686.00
7   30481   5613391.00
8   30482   6738643.00
9   30483   6178559.50
10  30484   7562679.50
11  30485   6156078.00
12  30486   5652129.00
13  30487   7042515.00
14  30488   6916156.50
15  30489   6547114.00
16  30490  26010980.00
17  30491  22164580.00
18  30492   9143826.00
19  30493  18696080.00
In [247]:
target_rnn_cat_enc = ["{0:.2f}".format(x) for x in target_test_cat_enc_rnn.reshape(-1)]

submission_rnn_cat_enc = pd.DataFrame({"id": test['id'], "price_doc": target_cnn_cat_enc})
print(submission_rnn_cat_enc[0:20])

submission_rnn_cat_enc.to_csv('kaggle_sberbank_rnn_cat_enc.csv', index=False)
       id    price_doc
0   30474   5246308.50
1   30475   8571797.00
2   30476   6716764.00
3   30477   7142270.00
4   30478   6176451.50
5   30479   7352989.00
6   30480   6200686.00
7   30481   5613391.00
8   30482   6738643.00
9   30483   6178559.50
10  30484   7562679.50
11  30485   6156078.00
12  30486   5652129.00
13  30487   7042515.00
14  30488   6916156.50
15  30489   6547114.00
16  30490  26010980.00
17  30491  22164580.00
18  30492   9143826.00
19  30493  18696080.00